Skip to content

Commit

Permalink
ARROW-16715: [R] Bump default parquet version (apache#13555)
Browse files Browse the repository at this point in the history
Also removes deprecated args to `write_parquet()`

Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
nealrichardson authored Jul 11, 2022
1 parent 66c66d0 commit f0ff8d0
Show file tree
Hide file tree
Showing 9 changed files with 122 additions and 87 deletions.
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ importFrom(rlang,"%||%")
importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,abort)
importFrom(rlang,arg_match)
importFrom(rlang,as_function)
importFrom(rlang,as_label)
importFrom(rlang,as_quosure)
Expand Down
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
* `orders` with year, month, day, hours, minutes, and seconds components are supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats` which subsequently get applied in turn. There is no `select_formats` parameter and no inference takes place (like is the case in `lubridate::parse_date_time()`).
* `read_arrow()` and `write_arrow()`, deprecated since 1.0.0 (July 2020), have been removed. Use the `read/write_feather()` and `read/write_ipc_stream()` functions depending on whether you're working with the Arrow IPC file or stream format, respectively.
* `write_parquet()` now defaults to writing Parquet format version 2.4 (was 1.0). Previously deprecated arguments `properties` and `arrow_properties` have been removed; if you need to deal with these lower-level properties objects directly, use `ParquetFileWriter`, which `write_parquet()` wraps.

# arrow 8.0.0

Expand Down
2 changes: 1 addition & 1 deletion r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
#' @importFrom rlang is_list call2 is_empty as_function as_label
#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
Expand Down
2 changes: 1 addition & 1 deletion r/R/enums.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
# Parquet format version identifiers exposed to R. Presumably these integer
# codes mirror the C++ ParquetVersion enum — TODO confirm against arrow C++.
# "2.0" is retained for backward compatibility; "2.4" and "2.6" select newer
# format feature sets.
ParquetVersionType <- enum("ParquetVersionType",
  PARQUET_1_0 = 0L, PARQUET_2_0 = 1L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
)

#' @export
Expand Down
99 changes: 49 additions & 50 deletions r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,30 +83,29 @@ read_parquet <- function(file,
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param chunk_size how many rows of data to write to disk at once. This
#' directly corresponds to how many rows will be in each row group in parquet.
#' If `NULL`, a best guess will be made for optimal size (based on the number of
#' columns and number of rows), though if the data has fewer than 250 million
#' cells (rows x cols), then the total number of rows is used.
#' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values
#' are coerced to character.
#' directly corresponds to how many rows will be in each row group in
#' parquet. If `NULL`, a best guess will be made for optimal size (based on
#' the number of columns and number of rows), though if the data has fewer
#' than 250 million cells (rows x cols), then the total number of rows is
#' used.
#' @param version parquet version: "1.0", "2.0" (deprecated), "2.4" (default),
#' "2.6", or "latest" (currently equivalent to 2.6). Numeric values are
#' coerced to character.
#' @param compression compression algorithm. Default "snappy". See details.
#' @param compression_level compression level. Meaning depends on compression algorithm
#' @param use_dictionary Specify if we should use dictionary encoding. Default `TRUE`
#' @param write_statistics Specify if we should write statistics. Default `TRUE`
#' @param compression_level compression level. Meaning depends on compression
#' algorithm
#' @param use_dictionary logical: use dictionary encoding? Default `TRUE`
#' @param write_statistics logical: include statistics? Default `TRUE`
#' @param data_page_size Set a target threshold for the approximate encoded
#' size of data pages within a column chunk (in bytes). Default 1 MiB.
#' @param use_deprecated_int96_timestamps Write timestamps to INT96 Parquet format. Default `FALSE`.
#' @param use_deprecated_int96_timestamps logical: write timestamps to INT96
#' Parquet format, which has been deprecated? Default `FALSE`.
#' @param coerce_timestamps Cast timestamps a particular resolution. Can be
#' `NULL`, "ms" or "us". Default `NULL` (no casting)
#' @param allow_truncated_timestamps Allow loss of data when coercing timestamps to a
#' particular resolution. E.g. if microsecond or nanosecond data is lost when coercing
#' to "ms", do not raise an exception
#' @param properties A `ParquetWriterProperties` object, used instead of the options
#' enumerated in this function's signature. Providing `properties` as an argument
#' is deprecated; if you need to assemble `ParquetWriterProperties` outside
#' of `write_parquet()`, use `ParquetFileWriter` instead.
#' @param arrow_properties A `ParquetArrowWriterProperties` object. Like
#' `properties`, this argument is deprecated.
#' @param allow_truncated_timestamps logical: Allow loss of data when coercing
#' timestamps to a particular resolution. E.g. if microsecond or nanosecond
#' data is lost when coercing to "ms", do not raise an exception. Default
#' `FALSE`.
#'
#' @details The parameters `compression`, `compression_level`, `use_dictionary` and
#' `write_statistics` support various patterns:
Expand All @@ -128,7 +127,7 @@ read_parquet <- function(file,
#' Note that "uncompressed" columns may still have dictionary encoding.
#'
#' @return the input `x` invisibly.
#'
#' @seealso [ParquetFileWriter] for a lower-level interface to Parquet writing.
#' @examplesIf arrow_with_parquet()
#' tf1 <- tempfile(fileext = ".parquet")
#' write_parquet(data.frame(x = 1:5), tf1)
Expand All @@ -143,7 +142,7 @@ write_parquet <- function(x,
sink,
chunk_size = NULL,
# writer properties
version = NULL,
version = "2.4",
compression = default_parquet_compression(),
compression_level = NULL,
use_dictionary = NULL,
Expand All @@ -152,9 +151,7 @@ write_parquet <- function(x,
# arrow writer properties
use_deprecated_int96_timestamps = FALSE,
coerce_timestamps = NULL,
allow_truncated_timestamps = FALSE,
properties = NULL,
arrow_properties = NULL) {
allow_truncated_timestamps = FALSE) {
x_out <- x
x <- as_writable_table(x)

Expand All @@ -163,24 +160,10 @@ write_parquet <- function(x,
on.exit(sink$close())
}

# Deprecation warnings
if (!is.null(properties)) {
warning(
"Providing 'properties' is deprecated. If you need to assemble properties outside ",
"this function, use ParquetFileWriter instead."
)
}
if (!is.null(arrow_properties)) {
warning(
"Providing 'arrow_properties' is deprecated. If you need to assemble arrow_properties ",
"outside this function, use ParquetFileWriter instead."
)
}

writer <- ParquetFileWriter$create(
x$schema,
sink,
properties = properties %||% ParquetWriterProperties$create(
properties = ParquetWriterProperties$create(
names(x),
version = version,
compression = compression,
Expand All @@ -189,7 +172,7 @@ write_parquet <- function(x,
write_statistics = write_statistics,
data_page_size = data_page_size
),
arrow_properties = arrow_properties %||% ParquetArrowWriterProperties$create(
arrow_properties = ParquetArrowWriterProperties$create(
use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
coerce_timestamps = coerce_timestamps,
allow_truncated_timestamps = allow_truncated_timestamps
Expand Down Expand Up @@ -238,19 +221,35 @@ ParquetArrowWriterProperties$create <- function(use_deprecated_int96_timestamps

# Lookup table from user-facing version strings to ParquetVersionType values.
# "latest" is an alias for the newest supported format version (currently 2.6).
valid_parquet_version <- c(
  "1.0" = ParquetVersionType$PARQUET_1_0,
  "2.0" = ParquetVersionType$PARQUET_2_0,
  "2.4" = ParquetVersionType$PARQUET_2_4,
  "2.6" = ParquetVersionType$PARQUET_2_6,
  "latest" = ParquetVersionType$PARQUET_2_6
)

# Normalize a user-supplied parquet `version` to a ParquetVersionType value.
#
# Accepts integer (1L), double (2.4), or string ("2.4", "latest") input.
# Integerish input is first coerced to numeric, then any numeric input is
# formatted with one decimal place so that `1` and `1.0` both match "1.0".
# Anything that is not then a length-one string among `names(valid_versions)`
# triggers an error (with `call. = FALSE` so the message does not carry an
# internal call). Selecting the deprecated "2.0" format emits a warning.
#
# @param version integer, double, or string version specifier.
# @param valid_versions named vector mapping version strings to enum values;
#   defaults to `valid_parquet_version`.
# @return One ParquetVersionType value.
make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) {
  if (is_integerish(version)) {
    version <- as.numeric(version)
  }
  if (is.numeric(version)) {
    # format(1, nsmall = 1) -> "1.0"; format(2.4, nsmall = 1) -> "2.4"
    version <- format(version, nsmall = 1)
  }

  if (!is.string(version)) {
    stop(
      "`version` must be one of ", oxford_paste(names(valid_versions), "or"),
      call. = FALSE
    )
  }
  out <- valid_versions[[arg_match(version, values = names(valid_versions))]]

  if (identical(out, ParquetVersionType$PARQUET_2_0)) {
    warning(
      'Parquet format version "2.0" is deprecated. Use "2.4" or "2.6" to select format features.',
      call. = FALSE
    )
  }
  out
}

#' @title ParquetWriterProperties class
Expand Down Expand Up @@ -300,7 +299,7 @@ ParquetWriterPropertiesBuilder <- R6Class("ParquetWriterPropertiesBuilder",
inherit = ArrowObject,
public = list(
set_version = function(version) {
parquet___WriterProperties___Builder__version(self, make_valid_version(version))
parquet___WriterProperties___Builder__version(self, make_valid_parquet_version(version))
},
set_compression = function(column_names, compression) {
compression <- compression_from_name(compression)
Expand Down
2 changes: 1 addition & 1 deletion r/man/enums.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 23 additions & 25 deletions r/man/write_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/tests/testthat/_snaps/dataset-write.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@
write_dataset(df, dst_dir, format = "parquet", nonsensical_arg = "blah-blah")
Error <rlang_error>
`nonsensical_arg` is not a valid argument for your chosen `format`.
i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, `allow_truncated_timestamps`, `properties`, and `arrow_properties`.
i Supported arguments: `chunk_size`, `version`, `compression`, `compression_level`, `use_dictionary`, `write_statistics`, `data_page_size`, `use_deprecated_int96_timestamps`, `coerce_timestamps`, and `allow_truncated_timestamps`.

52 changes: 44 additions & 8 deletions r/tests/testthat/test-parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,51 @@ test_that("write_parquet() can truncate timestamps", {
expect_equal(as.data.frame(tab), as.data.frame(new))
})

test_that("make_valid_parquet_version()", {
  expect_equal(
    make_valid_parquet_version("1.0"),
    ParquetVersionType$PARQUET_1_0
  )
  # "2.0" still resolves, but selecting it must raise a deprecation warning
  expect_deprecated(
    expect_equal(
      make_valid_parquet_version("2.0"),
      ParquetVersionType$PARQUET_2_0
    )
  )
  expect_equal(
    make_valid_parquet_version("2.4"),
    ParquetVersionType$PARQUET_2_4
  )
  expect_equal(
    make_valid_parquet_version("2.6"),
    ParquetVersionType$PARQUET_2_6
  )
  # "latest" aliases the newest supported format version
  expect_equal(
    make_valid_parquet_version("latest"),
    ParquetVersionType$PARQUET_2_6
  )

  # Integer and double input is normalized to the matching string form
  expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
  expect_deprecated(
    expect_equal(make_valid_parquet_version(2), ParquetVersionType$PARQUET_2_0)
  )
  expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0)
  expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4)
})

test_that("make_valid_parquet_version() input validation", {
  # Unknown version strings are rejected with a caller-friendly message
  expect_error(
    make_valid_parquet_version("0.3.14"),
    "`version` must be one of"
  )
  # NULL never becomes a string, so it falls through to the same error
  expect_error(
    make_valid_parquet_version(NULL),
    "`version` must be one of"
  )
  # Only length-one strings are accepted; vectors are rejected
  expect_error(
    make_valid_parquet_version(c("2", "4")),
    "`version` must be one of"
  )
})

test_that("write_parquet() defaults to snappy compression", {
Expand Down Expand Up @@ -239,7 +275,7 @@ test_that("write_parquet() handles version argument", {
tf <- tempfile()
on.exit(unlink(tf))

purrr::walk(list("1.0", "2.0", 1.0, 2.0, 1L, 2L), ~ {
purrr::walk(list("1.0", "2.4", "2.6", "latest", 1.0, 2.4, 2.6, 1L), ~ {
write_parquet(df, tf, version = .x)
expect_identical(read_parquet(tf), df)
})
Expand Down

0 comments on commit f0ff8d0

Please sign in to comment.