You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2023/01/09 12:19:25 UTC

[arrow] branch master updated: ARROW-15470: [R] Set null value in CSV writer (#14679)

This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 878d5cac09 ARROW-15470: [R] Set null value in CSV writer (#14679)
878d5cac09 is described below

commit 878d5cac09073cee72de000a7e8418ce8a3a31b8
Author: Will Jones <wi...@gmail.com>
AuthorDate: Mon Jan 9 04:19:15 2023 -0800

    ARROW-15470: [R] Set null value in CSV writer (#14679)
    
    Authored-by: Will Jones <wi...@gmail.com>
    Signed-off-by: Nic Crane <th...@gmail.com>
---
 r/R/csv.R                                | 29 +++++++++++++++++-------
 r/R/dataset-format.R                     | 32 +++++++++++++++++++++-----
 r/man/CsvReadOptions.Rd                  |  2 ++
 r/man/write_csv_arrow.Rd                 |  4 ++++
 r/src/csv.cpp                            |  1 +
 r/tests/testthat/_snaps/dataset-write.md |  2 +-
 r/tests/testthat/test-csv.R              | 39 ++++++++++++++++++++++++++++++++
 r/tests/testthat/test-dataset-csv.R      | 22 ++++++++++++++++++
 8 files changed, 116 insertions(+), 15 deletions(-)

diff --git a/r/R/csv.R b/r/R/csv.R
index fef8723fb2..6f53a060f5 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -418,6 +418,8 @@ CsvTableReader$create <- function(file,
 #' The `CsvWriteOptions$create()` factory method takes the following arguments:
 #' - `include_header` Whether to write an initial header line with column names
 #' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
+#' - `null_string` The string to be written for null values. Must not contain
+#'   quotation marks. Default is an empty string (`""`).
 #'
 #' @section Active bindings:
 #'
@@ -455,25 +457,32 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   options
 }
 
-readr_to_csv_write_options <- function(include_header,
-                                       batch_size = 1024L) {
-  assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
-  assert_that(is.logical(include_header))
+readr_to_csv_write_options <- function(include_header = TRUE,
+                                       batch_size = 1024L,
+                                       na = "") {
   CsvWriteOptions$create(
     include_header = include_header,
-    batch_size = as.integer(batch_size)
+    batch_size = batch_size,
+    null_string = na
   )
 }
 
 #' @rdname CsvReadOptions
 #' @export
 CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, null_string = "") {
   assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+  assert_that(is.logical(include_header))
+  assert_that(is.character(null_string))
+  assert_that(!is.na(null_string))
+  assert_that(length(null_string) == 1)
+  assert_that(!grepl('"', null_string), msg = "na argument must not contain quote characters.")
+
   csv___WriteOptions__initialize(
     list(
       include_header = include_header,
-      batch_size = as.integer(batch_size)
+      batch_size = as.integer(batch_size),
+      null_string = as.character(null_string)
     )
   )
 }
@@ -665,6 +674,8 @@ readr_to_csv_convert_options <- function(na,
 #' @param col_names identical to `include_header`. Specify this or
 #'     `include_headers`, not both.
 #' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#' @param na value to write for NA values. Must not contain quote marks. Default
+#'     is `""`.
 #' @param write_options see [file reader options][CsvWriteOptions]
 #' @param ... additional parameters
 #'
@@ -682,6 +693,7 @@ write_csv_arrow <- function(x,
                             include_header = TRUE,
                             col_names = NULL,
                             batch_size = 1024L,
+                            na = "",
                             write_options = NULL,
                             ...) {
   unsupported_passed_args <- names(list(...))
@@ -723,7 +735,8 @@ write_csv_arrow <- function(x,
   if (is.null(write_options)) {
     write_options <- readr_to_csv_write_options(
       include_header = include_header,
-      batch_size = batch_size
+      batch_size = batch_size,
+      na = na
     )
   }
 
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index aacde187c4..c1d2730bb6 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -452,7 +452,10 @@ FileWriteOptions <- R6Class("FileWriteOptions",
             "null_fallback"
           )
         } else if (format == "csv") {
-          supported_args <- names(formals(CsvWriteOptions$create))
+          supported_args <- c(
+            names(formals(CsvWriteOptions$create)),
+            names(formals(readr_to_csv_write_options))
+          )
         }
 
         unsupported_passed_args <- setdiff(passed_args, supported_args)
@@ -470,7 +473,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
           err_info <- NULL
           arg_info <- paste0(
             "Supported arguments: ",
-            oxford_paste(supported_args, quote_symbol = "`"),
+            oxford_paste(unique(supported_args), quote_symbol = "`"),
             "."
           )
           if ("compression" %in% unsupported_passed_args) {
@@ -505,10 +508,27 @@ FileWriteOptions <- R6Class("FileWriteOptions",
           )
         }
       } else if (self$type == "csv") {
-        dataset___CsvFileWriteOptions__update(
-          self,
-          CsvWriteOptions$create(...)
-        )
+        arrow_opts <- names(formals(CsvWriteOptions$create))
+        readr_opts <- names(formals(readr_to_csv_write_options))
+        readr_only_opts <- setdiff(readr_opts, arrow_opts)
+
+        is_arrow_opt <- !is.na(pmatch(names(args), arrow_opts))
+        is_readr_opt <- !is.na(pmatch(names(args), readr_opts))
+        is_readr_only_opt <- !is.na(pmatch(names(args), readr_only_opts))
+
+        # These option names aren't mutually exclusive, so only use readr path
+        # if we have at least one readr-specific option.
+        if (sum(is_readr_only_opt)) {
+          dataset___CsvFileWriteOptions__update(
+            self,
+            do.call(readr_to_csv_write_options, args[is_readr_opt])
+          )
+        } else {
+          dataset___CsvFileWriteOptions__update(
+            self,
+            do.call(CsvWriteOptions$create, args[is_arrow_opt])
+          )
+        }
       }
       invisible(self)
     }
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index a3cf2073ee..270d522b83 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -96,6 +96,8 @@ The \code{CsvWriteOptions$create()} factory method takes the following arguments
 \itemize{
 \item \code{include_header} Whether to write an initial header line with column names
 \item \code{batch_size} Maximum number of rows processed at a time. Default is 1024.
+\item \code{null_string} The string to be written for null values. Must not contain
+quotation marks. Default is an empty string (\code{""}).
 }
 }
 
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index c93c94fd8f..2b0d09ba74 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -11,6 +11,7 @@ write_csv_arrow(
   include_header = TRUE,
   col_names = NULL,
   batch_size = 1024L,
+  na = "",
   write_options = NULL,
   ...
 )
@@ -30,6 +31,9 @@ system (\code{SubTreeFileSystem})}
 
 \item{batch_size}{Maximum number of rows processed at a time. Default is 1024.}
 
+\item{na}{value to write for NA values. Must not contain quote marks. Default
+is \code{""}.}
+
 \item{write_options}{see \link[=CsvWriteOptions]{file reader options}}
 
 \item{...}{additional parameters}
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index 7747369300..3f880cae16 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -32,6 +32,7 @@ std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize(
   res->include_header = cpp11::as_cpp<bool>(options["include_header"]);
   res->batch_size = cpp11::as_cpp<int>(options["batch_size"]);
   res->io_context = MainRThread::GetInstance().CancellableIOContext();
+  res->null_string = cpp11::as_cpp<std::string>(options["null_string"]);
   return res;
 }
 
diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md
index cea2a30a29..e302d8463d 100644
--- a/r/tests/testthat/_snaps/dataset-write.md
+++ b/r/tests/testthat/_snaps/dataset-write.md
@@ -42,7 +42,7 @@
     Condition
       Error in `check_additional_args()`:
       ! `nonsensical_arg` is not a valid argument for your chosen `format`.
-      i Supported arguments: `include_header` and `batch_size`.
+      i Supported arguments: `include_header`, `batch_size`, `null_string`, and `na`.
 
 ---
 
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 6033253517..ee290e3241 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -422,6 +422,45 @@ test_that("Write a CSV file with invalid batch size", {
   )
 })
 
+test_that("Write a CSV with custom NA value", {
+  tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, na = "NULL_VALUE")
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl_no_dates)
+
+  csv_contents <- readLines(csv_file)
+  expect_true(any(grepl("NULL_VALUE", csv_contents)))
+
+  tbl_in1 <- read_csv_arrow(csv_file, na = "NULL_VALUE")
+  expect_identical(tbl_in1, tbl_no_dates)
+
+  # Also can use null_value in CsvWriteOptions
+  tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file,
+    write_options = CsvWriteOptions$create(null_string = "another_null")
+  )
+  csv_contents <- readLines(csv_file)
+  expect_true(any(grepl("another_null", csv_contents)))
+
+  tbl_in1 <- read_csv_arrow(csv_file, na = "another_null")
+  expect_identical(tbl_in1, tbl_no_dates)
+
+  # Also can use empty string
+  write_csv_arrow(tbl_no_dates, csv_file, na = "")
+  expect_true(file.exists(csv_file))
+
+  csv_contents <- readLines(csv_file)
+  expect_true(any(grepl(",,", csv_contents)))
+
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl_no_dates)
+})
+
+test_that("Write a CSV file with invalid null value", {
+  expect_error(
+    write_csv_arrow(tbl_no_dates, csv_file, na = "MY\"VAL"),
+    regexp = "must not contain quote characters"
+  )
+})
+
 test_that("time mapping work as expected (ARROW-13624)", {
   tbl <- tibble::tibble(
     dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 29e95f509c..3f85c89e45 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -273,6 +273,28 @@ test_that("readr parse options", {
   )
 })
 
+test_that("Can set null string values", {
+  dst_dir <- make_temp_dir()
+  df <- tibble(x = c(1, NA, 3))
+  write_dataset(df, dst_dir, null_string = "NULL_VALUE", format = "csv")
+
+  csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1])
+  expect_equal(csv_contents, c("\"x\"", "1", "NULL_VALUE", "3"))
+
+  back <- open_dataset(dst_dir, null_values = "NULL_VALUE", format = "csv") %>% collect()
+  expect_equal(df, back)
+
+  # Also works with `na` parameter
+  dst_dir <- make_temp_dir()
+  write_dataset(df, dst_dir, na = "another_null", format = "csv")
+
+  csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1])
+  expect_equal(csv_contents, c("\"x\"", "1", "another_null", "3"))
+
+  back <- open_dataset(dst_dir, null_values = "another_null", format = "csv") %>% collect()
+  expect_equal(df, back)
+})
+
 # see https://issues.apache.org/jira/browse/ARROW-12791
 test_that("Error if no format specified and files are not parquet", {
   expect_error(