You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2022/08/02 21:35:17 UTC
[arrow] branch master updated: ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)
This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8cac69c809 ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)
8cac69c809 is described below
commit 8cac69c809e2ae9d4ba9c10c7b22869c1fd11323
Author: mopcup <40...@users.noreply.github.com>
AuthorDate: Wed Aug 3 06:35:10 2022 +0900
ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)
Lead-authored-by: mopcup <mo...@gmail.com>
Co-authored-by: mopcup <40...@users.noreply.github.com>
Signed-off-by: Neal Richardson <ne...@gmail.com>
---
r/R/dataset-write.R | 8 +++++--
r/man/write_dataset.Rd | 5 +++--
r/tests/testthat/test-dataset-write.R | 42 ++++++++++++++++++++++++++++++++---
3 files changed, 48 insertions(+), 7 deletions(-)
diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index 496aaad205..e0181ee74f 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -34,8 +34,9 @@
#' use the current `group_by()` columns.
#' @param basename_template string template for the names of files to be written.
#' Must contain `"{i}"`, which will be replaced with an autoincremented
-#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
-#' will yield `"part-0.feather", ...`.
+#' integer to generate basenames of datafiles. For example, `"part-{i}.arrow"`
+#' will yield `"part-0.arrow", ...`.
+#' If not specified, it defaults to `"part-{i}.<default extension>"`.
#' @param hive_style logical: write partition segments as Hive-style
#' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
#' @param existing_data_behavior The behavior to use when there is already data
@@ -133,6 +134,9 @@ write_dataset <- function(dataset,
max_rows_per_group = bitwShiftL(1, 20),
...) {
format <- match.arg(format)
+ if (format %in% c("feather", "ipc")) {
+ format <- "arrow"
+ }
if (inherits(dataset, "arrow_dplyr_query")) {
# partitioning vars need to be in the `select` schema
dataset <- ensure_group_vars(dataset)
diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd
index 8fc07d5cc7..1bc940697c 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_dataset.Rd
@@ -38,8 +38,9 @@ use the current \code{group_by()} columns.}
\item{basename_template}{string template for the names of files to be written.
Must contain \code{"{i}"}, which will be replaced with an autoincremented
-integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"}
-will yield \verb{"part-0.feather", ...}.}
+integer to generate basenames of datafiles. For example, \code{"part-{i}.arrow"}
+will yield \verb{"part-0.arrow", ...}.
+If not specified, it defaults to \code{"part-{i}.<default extension>"}.}
\item{hive_style}{logical: write partition segments as Hive-style
(\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.}
diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R
index 2f4ff7e649..7a5f861ca5 100644
--- a/r/tests/testthat/test-dataset-write.R
+++ b/r/tests/testthat/test-dataset-write.R
@@ -63,7 +63,7 @@ test_that("Writing a dataset: CSV->IPC", {
# Check whether "int" is present in the files or just in the dirs
first <- read_feather(
- dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
+ dir(dst_dir, pattern = ".arrow$", recursive = TRUE, full.names = TRUE)[1],
as_data_frame = FALSE
)
# It shouldn't be there
@@ -139,6 +139,40 @@ test_that("Writing a dataset: Parquet->Parquet (default)", {
)
})
+test_that("Writing a dataset: `basename_template` default behavior", {
+ ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "parquet", max_rows_per_file = 5L)
+ expect_identical(
+ dir(dst_dir, full.names = FALSE, recursive = TRUE),
+ paste0("part-", 0:3, ".parquet")
+ )
+ dst_dir <- make_temp_dir()
+ write_dataset(ds, dst_dir, format = "parquet", basename_template = "{i}.data", max_rows_per_file = 5L)
+ expect_identical(
+ dir(dst_dir, full.names = FALSE, recursive = TRUE),
+ paste0(0:3, ".data")
+ )
+ dst_dir <- make_temp_dir()
+ expect_error(
+ write_dataset(ds, dst_dir, format = "parquet", basename_template = "part-i.parquet"),
+ "basename_template did not contain '\\{i\\}'"
+ )
+ feather_dir <- make_temp_dir()
+ write_dataset(ds, feather_dir, format = "feather", partitioning = "int")
+ expect_identical(
+ dir(feather_dir, full.names = FALSE, recursive = TRUE),
+ sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+ )
+ ipc_dir <- make_temp_dir()
+ write_dataset(ds, ipc_dir, format = "ipc", partitioning = "int")
+ expect_identical(
+ dir(ipc_dir, full.names = FALSE, recursive = TRUE),
+ sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+ )
+})
+
test_that("Writing a dataset: existing data behavior", {
# This test does not work on Windows because unlink does not immediately
# delete the data.
@@ -458,8 +492,10 @@ test_that("Writing a dataset: CSV format options", {
test_that("Dataset writing: unsupported features/input validation", {
skip_if_not_available("parquet")
expect_error(write_dataset(4), "You must supply a")
- expect_error(write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
- "Field names must be unique")
+ expect_error(
+ write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
+ "Field names must be unique"
+ )
ds <- open_dataset(hive_dir)
expect_error(