You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/29 15:05:09 UTC
[GitHub] [arrow] nealrichardson commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

nealrichardson commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r623135520



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)

Review comment:
       What happens if you remove this and pass a negative batch size?

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {

Review comment:
       What is this testing? What does batch_size do? It doesn't look like there is an observable difference in the output.

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {
+  
+  tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl_no_dates)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl_no_dates)
+  
+  tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl_no_dates)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl_no_dates)
+  
+  tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl_no_dates)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out1 <- write_csv_arrow(tbl, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl)
+  
+  tbl_out2 <- write_csv_arrow(tbl, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl)
+  
+  tbl_out3 <- write_csv_arrow(tbl, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl)
+  
+})
+
+

Review comment:
       You should add tests for handling bad inputs too. It might also make more sense to put the writing tests at the bottom of the test file instead of the top.

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)
+  
+  x_out <- x
+  if (is.data.frame(x)) {
+    x <- Table$create(x)
+  }
+  
+  assert_is(x, c("Table", "RecordBatch"))

Review comment:
       ```suggestion
     assert_is(x, "ArrowTabular")
   ```

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)

Review comment:
       ```suggestion
     write_options <- CsvWriteOptions$create(include_header, batch_size)
   ```

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))

Review comment:
       What happens if you remove these--will the C++ static typing validate this enough?
   
   What happens if `include_header = NA`?

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,

Review comment:
       Looks like indentation is slightly off here

##########
File path: r/R/csv.R
##########
@@ -408,6 +408,34 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   )
 }
 
+#' @title File writer options
+#' @rdname CsvWriteOptions
+#' @name CsvWriteOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,

Review comment:
       An alternative to documenting this here (and cleaning up the bad copy-paste) would be to document it with `CsvReadOptions` et al.

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")

Review comment:
       I don't think you need to test the file-with-dates in every combination of parameters; just the first one is sufficient.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org