Posted to commits@arrow.apache.org by np...@apache.org on 2022/08/02 21:35:17 UTC

[arrow] branch master updated: ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8cac69c809 ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)
8cac69c809 is described below

commit 8cac69c809e2ae9d4ba9c10c7b22869c1fd11323
Author: mopcup <40...@users.noreply.github.com>
AuthorDate: Wed Aug 3 06:35:10 2022 +0900

    ARROW-17088: [R] Use `.arrow` as extension of IPC files of datasets (#13690)
    
    Lead-authored-by: mopcup <mo...@gmail.com>
    Co-authored-by: mopcup <40...@users.noreply.github.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/R/dataset-write.R                   |  8 +++++--
 r/man/write_dataset.Rd                |  5 +++--
 r/tests/testthat/test-dataset-write.R | 42 ++++++++++++++++++++++++++++++++---
 3 files changed, 48 insertions(+), 7 deletions(-)
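
For context, the user-visible effect of this change is the default file
extension that write_dataset() picks per format. A minimal sketch of the
behavior after this commit (the data frame and temp directories are
illustrative, not taken from the patch):

    library(arrow)

    df <- data.frame(x = 1:10)

    # Parquet keeps its extension, as before
    pq_dir <- tempfile()
    write_dataset(df, pq_dir, format = "parquet")
    dir(pq_dir)  # "part-0.parquet"

    # "feather" and "ipc" are normalized to "arrow", so the default
    # basename template now uses the .arrow extension
    ipc_dir <- tempfile()
    write_dataset(df, ipc_dir, format = "feather")
    dir(ipc_dir)  # "part-0.arrow" (previously "part-0.feather")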

diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index 496aaad205..e0181ee74f 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -34,8 +34,9 @@
 #' use the current `group_by()` columns.
 #' @param basename_template string template for the names of files to be written.
 #' Must contain `"{i}"`, which will be replaced with an autoincremented
-#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"`
-#' will yield `"part-0.feather", ...`.
+#' integer to generate basenames of datafiles. For example, `"part-{i}.arrow"`
+#' will yield `"part-0.arrow", ...`.
+#' If not specified, it defaults to `"part-{i}.<default extension>"`.
 #' @param hive_style logical: write partition segments as Hive-style
 #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`.
 #' @param existing_data_behavior The behavior to use when there is already data
@@ -133,6 +134,9 @@ write_dataset <- function(dataset,
                           max_rows_per_group = bitwShiftL(1, 20),
                           ...) {
   format <- match.arg(format)
+  if (format %in% c("feather", "ipc")) {
+    format <- "arrow"
+  }
   if (inherits(dataset, "arrow_dplyr_query")) {
     # partitioning vars need to be in the `select` schema
     dataset <- ensure_group_vars(dataset)
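
The alias normalization above happens before the default basename_template
is derived, which is what yields the .arrow extension for both "feather"
and "ipc". The extension is cosmetic: the IPC readers do not key off it,
as the updated test below also shows. A hedged round-trip sketch (the path
is illustrative):

    library(arrow)

    ds_dir <- tempfile()
    write_dataset(data.frame(x = 1:3), ds_dir, format = "ipc")
    # The written .arrow file reads back directly...
    read_feather(dir(ds_dir, full.names = TRUE)[1])
    # ...and the directory still opens as a Feather/IPC dataset
    open_dataset(ds_dir, format = "feather")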
diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd
index 8fc07d5cc7..1bc940697c 100644
--- a/r/man/write_dataset.Rd
+++ b/r/man/write_dataset.Rd
@@ -38,8 +38,9 @@ use the current \code{group_by()} columns.}
 
 \item{basename_template}{string template for the names of files to be written.
 Must contain \code{"{i}"}, which will be replaced with an autoincremented
-integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"}
-will yield \verb{"part-0.feather", ...}.}
+integer to generate basenames of datafiles. For example, \code{"part-{i}.arrow"}
+will yield \verb{"part-0.arrow", ...}.
+If not specified, it defaults to \code{"part-{i}.<default extension>"}.}
 
 \item{hive_style}{logical: write partition segments as Hive-style
 (\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.}
diff --git a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R
index 2f4ff7e649..7a5f861ca5 100644
--- a/r/tests/testthat/test-dataset-write.R
+++ b/r/tests/testthat/test-dataset-write.R
@@ -63,7 +63,7 @@ test_that("Writing a dataset: CSV->IPC", {
 
   # Check whether "int" is present in the files or just in the dirs
   first <- read_feather(
-    dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1],
+    dir(dst_dir, pattern = ".arrow$", recursive = TRUE, full.names = TRUE)[1],
     as_data_frame = FALSE
   )
   # It shouldn't be there
@@ -139,6 +139,40 @@ test_that("Writing a dataset: Parquet->Parquet (default)", {
   )
 })
 
+test_that("Writing a dataset: `basename_template` default behavier", {
+  ds <- open_dataset(csv_dir, partitioning = "part", format = "csv")
+
+  dst_dir <- make_temp_dir()
+  write_dataset(ds, dst_dir, format = "parquet", max_rows_per_file = 5L)
+  expect_identical(
+    dir(dst_dir, full.names = FALSE, recursive = TRUE),
+    paste0("part-", 0:3, ".parquet")
+  )
+  dst_dir <- make_temp_dir()
+  write_dataset(ds, dst_dir, format = "parquet", basename_template = "{i}.data", max_rows_per_file = 5L)
+  expect_identical(
+    dir(dst_dir, full.names = FALSE, recursive = TRUE),
+    paste0(0:3, ".data")
+  )
+  dst_dir <- make_temp_dir()
+  expect_error(
+    write_dataset(ds, dst_dir, format = "parquet", basename_template = "part-i.parquet"),
+    "basename_template did not contain '\\{i\\}'"
+  )
+  feather_dir <- make_temp_dir()
+  write_dataset(ds, feather_dir, format = "feather", partitioning = "int")
+  expect_identical(
+    dir(feather_dir, full.names = FALSE, recursive = TRUE),
+    sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+  )
+  ipc_dir <- make_temp_dir()
+  write_dataset(ds, ipc_dir, format = "ipc", partitioning = "int")
+  expect_identical(
+    dir(ipc_dir, full.names = FALSE, recursive = TRUE),
+    sort(paste(paste("int", c(1:10, 101:110), sep = "="), "part-0.arrow", sep = "/"))
+  )
+})
+
 test_that("Writing a dataset: existing data behavior", {
   # This test does not work on Windows because unlink does not immediately
   # delete the data.
@@ -458,8 +492,10 @@ test_that("Writing a dataset: CSV format options", {
 test_that("Dataset writing: unsupported features/input validation", {
   skip_if_not_available("parquet")
   expect_error(write_dataset(4), "You must supply a")
-  expect_error(write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
-               "Field names must be unique")
+  expect_error(
+    write_dataset(data.frame(x = 1, x = 2, check.names = FALSE)),
+    "Field names must be unique"
+  )
 
   ds <- open_dataset(hive_dir)
   expect_error(
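
If downstream tooling still expects the old .feather naming, the documented
basename_template argument restores it without any other changes; a minimal
sketch (the path is illustrative):

    library(arrow)

    old_style <- tempfile()
    write_dataset(
      data.frame(x = 1:3), old_style,
      format = "feather", basename_template = "part-{i}.feather"
    )
    dir(old_style)  # "part-0.feather"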