You are viewing a plain text version of this content. The canonical link for it is here.

Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/23 13:39:12 UTC

[GitHub] [arrow] thisisnic opened a new pull request #10141: ARROW-11787: [R] Implement write csv [WIP]

thisisnic opened a new pull request #10141:
URL: https://github.com/apache/arrow/pull/10141


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625079599



##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {
+  
+  tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl_no_dates)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl_no_dates)
+  
+  tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl_no_dates)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl_no_dates)
+  
+  tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl_no_dates)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out1 <- write_csv_arrow(tbl, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl)
+  
+  tbl_out2 <- write_csv_arrow(tbl, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl)
+  
+  tbl_out3 <- write_csv_arrow(tbl, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl)
+  
+})
+
+

Review comment:
       Done




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625050418



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,

Review comment:
       Updated now




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625050158



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)

Review comment:
       Updated




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] github-actions[bot] commented on pull request #10141: ARROW-11787: [R] Implement write csv [WIP]

Posted by GitBox <gi...@apache.org>.

github-actions[bot] commented on pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#issuecomment-825665513


   https://issues.apache.org/jira/browse/ARROW-11787


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] westonpace commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

westonpace commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r623097697



##########
File path: r/R/csv.R
##########
@@ -408,6 +408,34 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   )
 }
 
+#' @title File writer options
+#' @rdname CsvWriteOptions
+#' @name CsvWriteOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,

Review comment:
       This description doesn't look quite correct.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625050069



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)
+  
+  x_out <- x
+  if (is.data.frame(x)) {
+    x <- Table$create(x)
+  }
+  
+  assert_is(x, c("Table", "RecordBatch"))

Review comment:
       Updated




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r624050250



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))

Review comment:
       Removed those, since the errors coming from the C++ are totally sensible, as you say.  With the `assert_that` removed, passing `include_header = NA` results in no header being written.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625049722



##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {

Review comment:
       So the output will be the same, but what's happening internally will be different.  I included it as I wanted to make sure I could pass through the param, but I guess it's C++ functionality.  Should I remove the tests for the different batch sizes and just make sure I can pass through the param once?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r624051995



##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")

Review comment:
       Agreed - updated.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nealrichardson closed pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

nealrichardson closed pull request #10141:
URL: https://github.com/apache/arrow/pull/10141


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nealrichardson commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

nealrichardson commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625288659



##########
File path: r/R/csv.R
##########
@@ -585,3 +602,52 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                            sink,
+                            include_header = TRUE,
+                            batch_size = 1024L) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)

Review comment:
       Let's move this to CsvWriteOptions$create(). I like to keep the input validation as close to the C++ call site as possible, just so that if someone uses the lower-level bindings, they still have protection from segfaults.
   ```suggestion
   ```

##########
File path: r/R/csv.R
##########
@@ -408,6 +413,18 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   )
 }
 
+#' @rdname CsvReadOptions
+#' @export
+CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)
+CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L){
+  csv___WriteOptions__initialize(
+    list(
+      include_header = include_header,
+      batch_size = batch_size
+    )
+  )

Review comment:
       `rlang::is_integerish` can handle a few other validation cases we want (length 1, not missing):
   
   ```
   > rlang::is_integerish(1:4)
   [1] TRUE
   > rlang::is_integerish(1:4, n = 1)
   [1] FALSE
   > rlang::is_integerish(NA_integer_)
   [1] TRUE
   > rlang::is_integerish(NA_integer_, finite = TRUE)
   [1] FALSE
   ```
   
   ```suggestion
     assert_that(is_integerish(batch_size, n = 1, finite = TRUE), batch_size > 0)
     csv___WriteOptions__initialize(
       list(
         include_header = include_header,
         batch_size = as.integer(batch_size)
       )
     )
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r624049352



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)

Review comment:
       I get the error below.  Shall I remove the assertion as it's handled at the C++ level, or leave it in as it's a cleaner error message with `assert_that`?
   
   ```
   Error: Invalid: Negative buffer resize: -40
   /home/nic2/arrow/cpp/src/arrow/buffer.cc:262  buffer->Resize(size)
   /home/nic2/arrow/cpp/src/arrow/csv/writer.cc:337  AllocateResizableBuffer( options.batch_size * schema_->num_fields() * kColumnSizeGuess, pool_)
   /home/nic2/arrow/cpp/src/arrow/csv/writer.cc:315  PrepareForContentsWrite(options, out)
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] thisisnic commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

thisisnic commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r625050850



##########
File path: r/R/csv.R
##########
@@ -408,6 +408,34 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   )
 }
 
+#' @title File writer options
+#' @rdname CsvWriteOptions
+#' @name CsvWriteOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,

Review comment:
       Updated now




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] nealrichardson commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

nealrichardson commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r623135520



##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)

Review comment:
       What happens if you remove this and pass a negative batch size?

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {

Review comment:
       What is this testing? What does batch_size do? It doesn't look like there is an observable difference in the output.

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {
+  
+  tbl_out1 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl_no_dates)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl_no_dates)
+  
+  tbl_out2 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl_no_dates)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl_no_dates)
+  
+  tbl_out3 <- write_csv_arrow(tbl_no_dates, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl_no_dates)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out1 <- write_csv_arrow(tbl, csv_file, batch_size = 1)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out1, tbl)
+  tbl_in1 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in1, tbl)
+  
+  tbl_out2 <- write_csv_arrow(tbl, csv_file, batch_size = 2)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out2, tbl)
+  tbl_in2 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in2, tbl)
+  
+  tbl_out3 <- write_csv_arrow(tbl, csv_file, batch_size = 12)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out3, tbl)
+  tbl_in3 <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in3, tbl)
+  
+})
+
+

Review comment:
       You should add tests for handling bad inputs too. It might also make more sense to put the writing tests at the bottom of the test file instead of the top.

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)
+  
+  x_out <- x
+  if (is.data.frame(x)) {
+    x <- Table$create(x)
+  }
+  
+  assert_is(x, c("Table", "RecordBatch"))

Review comment:
       ```suggestion
     assert_is(x, "ArrowTabular")
   ```

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))
+  
+  write_options = CsvWriteOptions$create(include_header, batch_size)

Review comment:
       ```suggestion
     write_options <- CsvWriteOptions$create(include_header, batch_size)
   ```

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,
+                          include_header = TRUE,
+                          batch_size = 1024L
+                          ) {
+  # Handle and validate options before touching data
+  batch_size <- as.integer(batch_size)
+  assert_that(batch_size > 0)
+  assert_that(length(include_header) == 1)
+  assert_that(is.logical(include_header))

Review comment:
       What happens if you remove these--will the C++ static typing validate this enough?
   
   What happens if `include_header = NA`?

##########
File path: r/R/csv.R
##########
@@ -585,3 +613,55 @@ readr_to_csv_convert_options <- function(na,
     include_columns = include_columns
   )
 }
+
+#' Write CSV file to disk
+#'
+#' @param x `data.frame`, [RecordBatch], or [Table]
+#' @param sink A string file path, URI, or [OutputStream], or path in a file
+#' system (`SubTreeFileSystem`)
+#' @param include_header Whether to write an initial header line with column names
+#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
+#'
+#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
+#' the stream will be left open.
+#' @export
+#' @examples
+#' \donttest{
+#' tf <- tempfile()
+#' on.exit(unlink(tf))
+#' write_csv_arrow(mtcars, tf)
+#' }
+#' @include arrow-package.R
+write_csv_arrow <- function(x,
+                          sink,

Review comment:
       Looks like the indentation is slightly off here: the continuation arguments should line up with the first argument, `x`.

##########
File path: r/R/csv.R
##########
@@ -408,6 +408,34 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
   )
 }
 
+#' @title File writer options
+#' @rdname CsvWriteOptions
+#' @name CsvWriteOptions
+#' @docType class
+#' @usage NULL
+#' @format NULL
+#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,

Review comment:
       An alternative to documenting this here (and cleaning up the bad copy-paste) would be to document it with `CsvReadOptions` et al.

##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")

Review comment:
       I don't think you need to test the file-with-dates in every combination of parameters; just the first one is sufficient.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #10141: ARROW-11787: [R] Implement write csv

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #10141:
URL: https://github.com/apache/arrow/pull/10141#discussion_r624618628



##########
File path: r/tests/testthat/test-csv.R
##########
@@ -15,13 +15,101 @@
 # specific language governing permissions and limitations
 # under the License.
 
-context("CsvTableReader")
-
 # Not all types round trip via CSV 100% identical by default
 tbl <- example_data[, c("dbl", "lgl", "false", "chr")]
+tbl_no_dates <- tbl
 # Add a date to test its parsing
 tbl$date <- Sys.Date() + 1:10
 
+csv_file <- tempfile()
+
+test_that("Write a CSV file with header", {
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl_no_dates)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  
+  tbl_in <- read_csv_arrow(csv_file)
+  expect_identical(tbl_in, tbl)
+})
+
+
+test_that("Write a CSV file with no header", {
+  
+  tbl_out <- write_csv_arrow(tbl_no_dates, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl_no_dates)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl_no_dates
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3")
+  
+  expect_identical(tbl_in, tbl_expected)
+  
+  skip("Doesn't yet work with date columns due to ARROW-12540")
+  
+  tbl_out <- write_csv_arrow(tbl, csv_file, include_header = FALSE)
+  expect_true(file.exists(csv_file))
+  expect_identical(tbl_out, tbl)
+  tbl_in <- read_csv_arrow(csv_file, col_names = FALSE)
+  
+  tbl_expected <- tbl
+  names(tbl_expected) <- c("f0", "f1", "f2", "f3", "f4")
+  
+  expect_identical(tbl_in, tbl_expected)
+})
+
+test_that("Write a CSV file with different batch sizes", {

Review comment:
       Batch size dictates how much data is buffered when translating to CSV.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org