You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2023/01/09 12:19:25 UTC
[arrow] branch master updated: ARROW-15470: [R] Set null value in CSV writer (#14679)
This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 878d5cac09 ARROW-15470: [R] Set null value in CSV writer (#14679)
878d5cac09 is described below
commit 878d5cac09073cee72de000a7e8418ce8a3a31b8
Author: Will Jones <wi...@gmail.com>
AuthorDate: Mon Jan 9 04:19:15 2023 -0800
ARROW-15470: [R] Set null value in CSV writer (#14679)
Authored-by: Will Jones <wi...@gmail.com>
Signed-off-by: Nic Crane <th...@gmail.com>
---
r/R/csv.R | 29 +++++++++++++++++-------
r/R/dataset-format.R | 32 +++++++++++++++++++++-----
r/man/CsvReadOptions.Rd | 2 ++
r/man/write_csv_arrow.Rd | 4 ++++
r/src/csv.cpp | 1 +
r/tests/testthat/_snaps/dataset-write.md | 2 +-
r/tests/testthat/test-csv.R | 39 ++++++++++++++++++++++++++++++++
r/tests/testthat/test-dataset-csv.R | 22 ++++++++++++++++++
8 files changed, 116 insertions(+), 15 deletions(-)
diff --git a/r/R/csv.R b/r/R/csv.R
index fef8723fb2..6f53a060f5 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -418,6 +418,8 @@ CsvTableReader$create <- function(file,
#' The `CsvWriteOptions$create()` factory method takes the following arguments:
#' - `include_header` Whether to write an initial header line with column names
#' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
+#' - `null_string` The string to be written for null values. Must not contain
+#' quotation marks. Default is an empty string (`""`).
#'
#' @section Active bindings:
#'
@@ -455,25 +457,32 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
options
}
-readr_to_csv_write_options <- function(include_header,
- batch_size = 1024L) {
- assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
- assert_that(is.logical(include_header))
+readr_to_csv_write_options <- function(include_header = TRUE,
+ batch_size = 1024L,
+ na = "") {
CsvWriteOptions$create(
include_header = include_header,
- batch_size = as.integer(batch_size)
+ batch_size = batch_size,
+ null_string = na
)
}
#' @rdname CsvReadOptions
#' @export
CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
-CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, null_string = "") {
assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
+ assert_that(is.logical(include_header))
+ assert_that(is.character(null_string))
+ assert_that(!is.na(null_string))
+ assert_that(length(null_string) == 1)
+ assert_that(!grepl('"', null_string), msg = "na argument must not contain quote characters.")
+
csv___WriteOptions__initialize(
list(
include_header = include_header,
- batch_size = as.integer(batch_size)
+ batch_size = as.integer(batch_size),
+ null_string = as.character(null_string)
)
)
}
@@ -665,6 +674,8 @@ readr_to_csv_convert_options <- function(na,
#' @param col_names identical to `include_header`. Specify this or
#' `include_headers`, not both.
#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#' @param na value to write for NA values. Must not contain quote marks. Default
+#' is `""`.
#' @param write_options see [file reader options][CsvWriteOptions]
#' @param ... additional parameters
#'
@@ -682,6 +693,7 @@ write_csv_arrow <- function(x,
include_header = TRUE,
col_names = NULL,
batch_size = 1024L,
+ na = "",
write_options = NULL,
...) {
unsupported_passed_args <- names(list(...))
@@ -723,7 +735,8 @@ write_csv_arrow <- function(x,
if (is.null(write_options)) {
write_options <- readr_to_csv_write_options(
include_header = include_header,
- batch_size = batch_size
+ batch_size = batch_size,
+ na = na
)
}
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index aacde187c4..c1d2730bb6 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -452,7 +452,10 @@ FileWriteOptions <- R6Class("FileWriteOptions",
"null_fallback"
)
} else if (format == "csv") {
- supported_args <- names(formals(CsvWriteOptions$create))
+ supported_args <- c(
+ names(formals(CsvWriteOptions$create)),
+ names(formals(readr_to_csv_write_options))
+ )
}
unsupported_passed_args <- setdiff(passed_args, supported_args)
@@ -470,7 +473,7 @@ FileWriteOptions <- R6Class("FileWriteOptions",
err_info <- NULL
arg_info <- paste0(
"Supported arguments: ",
- oxford_paste(supported_args, quote_symbol = "`"),
+ oxford_paste(unique(supported_args), quote_symbol = "`"),
"."
)
if ("compression" %in% unsupported_passed_args) {
@@ -505,10 +508,27 @@ FileWriteOptions <- R6Class("FileWriteOptions",
)
}
} else if (self$type == "csv") {
- dataset___CsvFileWriteOptions__update(
- self,
- CsvWriteOptions$create(...)
- )
+ arrow_opts <- names(formals(CsvWriteOptions$create))
+ readr_opts <- names(formals(readr_to_csv_write_options))
+ readr_only_opts <- setdiff(readr_opts, arrow_opts)
+
+ is_arrow_opt <- !is.na(pmatch(names(args), arrow_opts))
+ is_readr_opt <- !is.na(pmatch(names(args), readr_opts))
+ is_readr_only_opt <- !is.na(pmatch(names(args), readr_only_opts))
+
+ # These option names aren't mutually exclusive, so only use readr path
+ # if we have at least one readr-specific option.
+ if (sum(is_readr_only_opt)) {
+ dataset___CsvFileWriteOptions__update(
+ self,
+ do.call(readr_to_csv_write_options, args[is_readr_opt])
+ )
+ } else {
+ dataset___CsvFileWriteOptions__update(
+ self,
+ do.call(CsvWriteOptions$create, args[is_arrow_opt])
+ )
+ }
}
invisible(self)
}
diff --git a/r/man/CsvReadOptions.Rd b/r/man/CsvReadOptions.Rd
index a3cf2073ee..270d522b83 100644
--- a/r/man/CsvReadOptions.Rd
+++ b/r/man/CsvReadOptions.Rd
@@ -96,6 +96,8 @@ The \code{CsvWriteOptions$create()} factory method takes the following arguments
\itemize{
\item \code{include_header} Whether to write an initial header line with column names
\item \code{batch_size} Maximum number of rows processed at a time. Default is 1024.
+\item \code{null_string} The string to be written for null values. Must not contain
+quotation marks. Default is an empty string (\code{""}).
}
}
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index c93c94fd8f..2b0d09ba74 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -11,6 +11,7 @@ write_csv_arrow(
include_header = TRUE,
col_names = NULL,
batch_size = 1024L,
+ na = "",
write_options = NULL,
...
)
@@ -30,6 +31,9 @@ system (\code{SubTreeFileSystem})}
\item{batch_size}{Maximum number of rows processed at a time. Default is 1024.}
+\item{na}{value to write for NA values. Must not contain quote marks. Default
+is \code{""}.}
+
\item{write_options}{see \link[=CsvWriteOptions]{file reader options}}
\item{...}{additional parameters}
diff --git a/r/src/csv.cpp b/r/src/csv.cpp
index 7747369300..3f880cae16 100644
--- a/r/src/csv.cpp
+++ b/r/src/csv.cpp
@@ -32,6 +32,7 @@ std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize(
res->include_header = cpp11::as_cpp<bool>(options["include_header"]);
res->batch_size = cpp11::as_cpp<int>(options["batch_size"]);
res->io_context = MainRThread::GetInstance().CancellableIOContext();
+ res->null_string = cpp11::as_cpp<std::string>(options["null_string"]);
return res;
}
diff --git a/r/tests/testthat/_snaps/dataset-write.md b/r/tests/testthat/_snaps/dataset-write.md
index cea2a30a29..e302d8463d 100644
--- a/r/tests/testthat/_snaps/dataset-write.md
+++ b/r/tests/testthat/_snaps/dataset-write.md
@@ -42,7 +42,7 @@
Condition
Error in `check_additional_args()`:
! `nonsensical_arg` is not a valid argument for your chosen `format`.
- i Supported arguments: `include_header` and `batch_size`.
+ i Supported arguments: `include_header`, `batch_size`, `null_string`, and `na`.
---
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index 6033253517..ee290e3241 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -422,6 +422,45 @@ test_that("Write a CSV file with invalid batch size", {
)
})
+test_that("Write a CSV with custom NA value", {
+ tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, na = "NULL_VALUE")
+ expect_true(file.exists(csv_file))
+ expect_identical(tbl_out1, tbl_no_dates)
+
+ csv_contents <- readLines(csv_file)
+ expect_true(any(grepl("NULL_VALUE", csv_contents)))
+
+ tbl_in1 <- read_csv_arrow(csv_file, na = "NULL_VALUE")
+ expect_identical(tbl_in1, tbl_no_dates)
+
+ # Also can use null_value in CsvWriteOptions
+ tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file,
+ write_options = CsvWriteOptions$create(null_string = "another_null")
+ )
+ csv_contents <- readLines(csv_file)
+ expect_true(any(grepl("another_null", csv_contents)))
+
+ tbl_in1 <- read_csv_arrow(csv_file, na = "another_null")
+ expect_identical(tbl_in1, tbl_no_dates)
+
+ # Also can use empty string
+ write_csv_arrow(tbl_no_dates, csv_file, na = "")
+ expect_true(file.exists(csv_file))
+
+ csv_contents <- readLines(csv_file)
+ expect_true(any(grepl(",,", csv_contents)))
+
+ tbl_in1 <- read_csv_arrow(csv_file)
+ expect_identical(tbl_in1, tbl_no_dates)
+})
+
+test_that("Write a CSV file with invalid null value", {
+ expect_error(
+ write_csv_arrow(tbl_no_dates, csv_file, na = "MY\"VAL"),
+ regexp = "must not contain quote characters"
+ )
+})
+
test_that("time mapping work as expected (ARROW-13624)", {
tbl <- tibble::tibble(
dt = as.POSIXct(c("2020-07-20 16:20", NA), tz = "UTC"),
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 29e95f509c..3f85c89e45 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -273,6 +273,28 @@ test_that("readr parse options", {
)
})
+test_that("Can set null string values", {
+ dst_dir <- make_temp_dir()
+ df <- tibble(x = c(1, NA, 3))
+ write_dataset(df, dst_dir, null_string = "NULL_VALUE", format = "csv")
+
+ csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1])
+ expect_equal(csv_contents, c("\"x\"", "1", "NULL_VALUE", "3"))
+
+ back <- open_dataset(dst_dir, null_values = "NULL_VALUE", format = "csv") %>% collect()
+ expect_equal(df, back)
+
+ # Also works with `na` parameter
+ dst_dir <- make_temp_dir()
+ write_dataset(df, dst_dir, na = "another_null", format = "csv")
+
+ csv_contents <- readLines(list.files(dst_dir, full.names = TRUE)[1])
+ expect_equal(csv_contents, c("\"x\"", "1", "another_null", "3"))
+
+ back <- open_dataset(dst_dir, null_values = "another_null", format = "csv") %>% collect()
+ expect_equal(df, back)
+})
+
# see https://issues.apache.org/jira/browse/ARROW-12791
test_that("Error if no format specified and files are not parquet", {
expect_error(