Posted to commits@arrow.apache.org by ra...@apache.org on 2023/01/18 08:34:25 UTC

[arrow] 09/10: GH-33526: [R] Implement new function open_dataset_csv with signature more closely matching read_csv_arrow (#33614)

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-11.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 66332515d0fae4529be85351948a2e19f7336f49
Author: Nic Crane <th...@gmail.com>
AuthorDate: Tue Jan 17 18:55:00 2023 +0000

    GH-33526: [R] Implement new function open_dataset_csv with signature more closely matching read_csv_arrow (#33614)
    
    This PR implements a wrapper around `open_dataset()` specifically for delimiter-separated files. It takes the parameters of `open_dataset()` and appends those parameters of `read_csv_arrow()` that are compatible with `open_dataset()`. This should make it easier for users to switch between the two, e.g.:
    
    ``` r
    library(arrow)
    library(dplyr)
    
    # Set up directory for examples
    tf <- tempfile()
    dir.create(tf)
    on.exit(unlink(tf))
    df <- data.frame(x = c("1", "2", "NULL"))
    
    file_path <- file.path(tf, "file1.txt")
    write.table(df, file_path, sep = ",", row.names = FALSE)
    
    read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
    #> # A tibble: 3 × 1
    #>       y
    #>   <int>
    #> 1     1
    #> 2     2
    #> 3    NA
    
    open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1) %>% collect()
    #> # A tibble: 3 × 1
    #>       y
    #>   <int>
    #> 1     1
    #> 2     2
    #> 3    NA
    ```
    
    This PR also hooks up the readr-style "na" parameter to "null_values" (i.e. the corresponding CsvConvertOptions parameter).
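    
    For example, after this change the two spellings below produce equivalent formats (a minimal sketch, reusing the `tf` example directory from above):
    
    ``` r
    # readr-style option name
    format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
    open_dataset(tf, format = format)
    
    # equivalent Arrow-style CsvConvertOptions parameter
    format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
    open_dataset(tf, format = format)
    ```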
    
    In the process of making this PR, I also refactored `CsvFileFormat$create()`. Unfortunately, many changes needed to be made at once, which has considerably increased the size and complexity of this PR.
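    
    As a sketch of the refactored interface: each options group can be passed as a plain list, and the combined option names are validated in one place by `CsvFileFormat$create()` (option names below come from `CsvParseOptions$create()`, `CsvReadOptions$create()`, and `CsvConvertOptions$create()`):
    
    ``` r
    format <- CsvFileFormat$create(
      parse_options = list(delimiter = ","),
      read_options = list(skip_rows = 1L),
      convert_options = list(null_values = c("", "NA", "NULL"))
    )
    open_dataset(tf, format = format)
    ```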
    
    Authored-by: Nic Crane <th...@gmail.com>
    Signed-off-by: Nic Crane <th...@gmail.com>
---
 r/NAMESPACE                         |   3 +
 r/R/csv.R                           |   2 +-
 r/R/dataset-format.R                | 252 ++++++++++++++++++++++++------------
 r/R/dataset.R                       | 124 ++++++++++++++++++
 r/_pkgdown.yml                      |   4 +
 r/man/CsvFileFormat.Rd              |  41 ++++++
 r/man/FileFormat.Rd                 |   3 +-
 r/man/acero.Rd                      |   4 +-
 r/man/open_delim_dataset.Rd         | 216 +++++++++++++++++++++++++++++++
 r/tests/testthat/test-dataset-csv.R |  90 ++++++++++++-
 10 files changed, 648 insertions(+), 91 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index cde81d977b..3df107a2d8 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -348,7 +348,10 @@ export(new_extension_type)
 export(null)
 export(num_range)
 export(one_of)
+export(open_csv_dataset)
 export(open_dataset)
+export(open_delim_dataset)
+export(open_tsv_dataset)
 export(read_csv_arrow)
 export(read_delim_arrow)
 export(read_feather)
diff --git a/r/R/csv.R b/r/R/csv.R
index 08f30fdefd..135394b967 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -500,7 +500,7 @@ CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, nu
   )
 }
 
-readr_to_csv_read_options <- function(skip = 0, col_names = TRUE, col_types = NULL) {
+readr_to_csv_read_options <- function(skip = 0, col_names = TRUE) {
   if (isTRUE(col_names)) {
     # C++ default to parse is 0-length string array
     col_names <- character(0)
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 7b88d9b8e0..0912941e64 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -53,7 +53,7 @@
 #' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
 #' @rdname FileFormat
 #' @name FileFormat
-#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
+#' @examplesIf arrow_with_dataset()
 #' ## Semi-colon delimited files
 #' # Set up directory for examples
 #' tf <- tempfile()
@@ -113,107 +113,105 @@ ParquetFileFormat$create <- function(...,
 #' @export
 IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
 
-#' @usage NULL
-#' @format NULL
-#' @rdname FileFormat
+#' CSV dataset file format
+#'
+#' @description
+#' A `CsvFileFormat` is a [FileFormat] subclass which holds information about how to
+#' read and parse the files included in a CSV `Dataset`.
+#'
+#' @section Factory:
+#' `CsvFileFormat$create()` can take options in the form of lists passed through as `parse_options`,
+#'  `read_options`, or `convert_options` parameters.  Alternatively, readr-style options can be passed
+#'  through individually.  While it is possible to pass in `CsvReadOptions`, `CsvConvertOptions`, and `CsvParseOptions`
+#'  objects, this is not recommended as options set in these objects are not validated for compatibility.
+#'
+#' @return A `CsvFileFormat` object
+#' @rdname CsvFileFormat
+#' @name CsvFileFormat
+#' @seealso [FileFormat]
+#' @examplesIf arrow_with_dataset()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#' df <- data.frame(x = c("1", "2", "NULL"))
+#' write.table(df, file.path(tf, "file1.txt"), sep = ",", row.names = FALSE)
+#'
+#' # Create CsvFileFormat object with Arrow-style null_values option
+#' format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
+#' open_dataset(tf, format = format)
+#'
+#' # Use readr-style options
+#' format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
+#' open_dataset(tf, format = format)
+#'
 #' @export
 CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
-CsvFileFormat$create <- function(...,
-                                 opts = csv_file_format_parse_options(...),
-                                 convert_options = csv_file_format_convert_opts(...),
-                                 read_options = csv_file_format_read_opts(...)) {
-  check_csv_file_format_args(...)
-  # Evaluate opts first to catch any unsupported arguments
-  force(opts)
-
-  options <- list(...)
-  schema <- options[["schema"]]
-  if (!is.null(schema) && !inherits(schema, "Schema")) {
-    abort(paste0(
-      "`schema` must be an object of class 'Schema' not '",
-      class(schema)[1],
-      "'."
-    ))
-  }
-
-  if (!inherits(read_options, "CsvReadOptions")) {
-    read_options <- do.call(CsvReadOptions$create, read_options)
-  }
+CsvFileFormat$create <- function(...) {
+  dots <- list(...)
+  options <- check_csv_file_format_args(dots)
+  check_schema(options[["schema"]], options[["read_options"]]$column_names)
 
-  if (!inherits(convert_options, "CsvConvertOptions")) {
-    convert_options <- do.call(CsvConvertOptions$create, convert_options)
-  }
-
-  if (!inherits(opts, "CsvParseOptions")) {
-    opts <- do.call(CsvParseOptions$create, opts)
-  }
-
-  column_names <- read_options$column_names
-  schema_names <- names(schema)
+  dataset___CsvFileFormat__Make(options$parse_options, options$convert_options, options$read_options)
+}
 
-  if (!is.null(schema) && !identical(schema_names, column_names)) {
-    missing_from_schema <- setdiff(column_names, schema_names)
-    missing_from_colnames <- setdiff(schema_names, column_names)
-    message_colnames <- NULL
-    message_schema <- NULL
-    message_order <- NULL
+# Check all arguments are valid
+check_csv_file_format_args <- function(args) {
+  options <- list(
+    parse_options = args$parse_options,
+    convert_options = args$convert_options,
+    read_options = args$read_options,
+    schema = args$schema
+  )
 
-    if (length(missing_from_colnames) > 0) {
-      message_colnames <- paste(
-        oxford_paste(missing_from_colnames, quote_symbol = "`"),
-        "not present in `column_names`"
-      )
-    }
+  check_unsupported_args(args)
+  check_unrecognised_args(args)
 
-    if (length(missing_from_schema) > 0) {
-      message_schema <- paste(
-        oxford_paste(missing_from_schema, quote_symbol = "`"),
-        "not present in `schema`"
-      )
-    }
+  # Evaluate parse_options first to catch any unsupported arguments
+  if (is.null(args$parse_options)) {
+    options$parse_options <- do.call(csv_file_format_parse_opts, args)
+  } else if (is.list(args$parse_options)) {
+    options$parse_options <- do.call(CsvParseOptions$create, args$parse_options)
+  }
 
-    if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) {
-      message_order <- "`column_names` and `schema` field names match but are not in the same order"
-    }
+  if (is.null(args$convert_options)) {
+    options$convert_options <- do.call(csv_file_format_convert_opts, args)
+  } else if (is.list(args$convert_options)) {
+    options$convert_options <- do.call(CsvConvertOptions$create, args$convert_options)
+  }
 
-    abort(
-      c(
-        "Values in `column_names` must match `schema` field names",
-        x = message_order,
-        x = message_schema,
-        x = message_colnames
-      )
-    )
+  if (is.null(args$read_options)) {
+    options$read_options <- do.call(csv_file_format_read_opts, args)
+  } else if (is.list(args$read_options)) {
+    options$read_options <- do.call(CsvReadOptions$create, args$read_options)
   }
 
-  dataset___CsvFileFormat__Make(opts, convert_options, read_options)
+  options
 }
 
-# Check all arguments are valid
-check_csv_file_format_args <- function(...) {
-  opts <- list(...)
+check_unsupported_args <- function(args) {
+  opt_names <- get_opt_names(args)
+
   # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
-  convert_opts <- c(names(formals(CsvConvertOptions$create)))
+  supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na")
 
-  read_opts <- c(
+  supported_read_opts <- c(
     names(formals(CsvReadOptions$create)),
     names(formals(readr_to_csv_read_options))
   )
 
   # We only currently support all of the readr options for parseoptions
-  parse_opts <- c(
+  supported_parse_opts <- c(
     names(formals(CsvParseOptions$create)),
     names(formals(readr_to_csv_parse_options))
   )
 
-  opt_names <- names(opts)
-
   # Catch any readr-style options specified with full option names that are
   # supported by read_delim_arrow() (and its wrappers) but are not yet
   # supported here
   unsup_readr_opts <- setdiff(
     names(formals(read_delim_arrow)),
-    c(convert_opts, read_opts, parse_opts, "schema")
+    c(supported_convert_opts, supported_read_opts, supported_parse_opts, "schema")
   )
 
   is_unsup_opt <- opt_names %in% unsup_readr_opts
@@ -228,9 +226,36 @@ check_csv_file_format_args <- function(...) {
       call. = FALSE
     )
   }
+}
+
+# Returns the names of options passed in individually via args, along with the names
+# of options supplied inside "parse_options", "convert_options", or "read_options"
+# lists.  `get_opt_names()` ignores any Csv*Options objects passed in, as these are
+# not validated - users must ensure they've chosen reasonable values in this case.
+get_opt_names <- function(args) {
+  opt_names <- names(args)
+
+  # extract names of parse_options, read_options, and convert_options
+  if ("parse_options" %in% names(args) && is.list(args[["parse_options"]])) {
+    opt_names <- c(opt_names, names(args[["parse_options"]]))
+  }
+
+  if ("read_options" %in% names(args) && is.list(args[["read_options"]])) {
+    opt_names <- c(opt_names, names(args[["read_options"]]))
+  }
 
+  if ("convert_options" %in% names(args) && is.list(args[["convert_options"]])) {
+    opt_names <- c(opt_names, names(args[["convert_options"]]))
+  }
+
+  setdiff(opt_names, c("parse_options", "read_options", "convert_options"))
+}
+
+check_unrecognised_args <- function(opts) {
   # Catch any options with full or partial names that do not match any of the
   # recognized Arrow C++ option names or readr-style option names
+  opt_names <- get_opt_names(opts)
+
   arrow_opts <- c(
     names(formals(CsvParseOptions$create)),
     names(formals(CsvReadOptions$create)),
@@ -240,7 +265,8 @@ check_csv_file_format_args <- function(...) {
 
   readr_opts <- c(
     names(formals(readr_to_csv_parse_options)),
-    names(formals(readr_to_csv_read_options))
+    names(formals(readr_to_csv_read_options)),
+    "na"
   )
 
   is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
@@ -271,26 +297,74 @@ check_ambiguous_options <- function(passed_opts, opts1, opts2) {
   }
 }
 
+check_schema <- function(schema, column_names) {
+  if (!is.null(schema) && !inherits(schema, "Schema")) {
+    abort(paste0(
+      "`schema` must be an object of class 'Schema' not '",
+      class(schema)[1],
+      "'."
+    ))
+  }
+
+  schema_names <- names(schema)
+
+  if (!is.null(schema) && !identical(schema_names, column_names)) {
+    missing_from_schema <- setdiff(column_names, schema_names)
+    missing_from_colnames <- setdiff(schema_names, column_names)
+    message_colnames <- NULL
+    message_schema <- NULL
+    message_order <- NULL
+
+    if (length(missing_from_colnames) > 0) {
+      message_colnames <- paste(
+        oxford_paste(missing_from_colnames, quote_symbol = "`"),
+        "not present in `column_names`"
+      )
+    }
+
+    if (length(missing_from_schema) > 0) {
+      message_schema <- paste(
+        oxford_paste(missing_from_schema, quote_symbol = "`"),
+        "not present in `schema`"
+      )
+    }
+
+    if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) {
+      message_order <- "`column_names` and `schema` field names match but are not in the same order"
+    }
+
+    abort(
+      c(
+        "Values in `column_names` must match `schema` field names",
+        x = message_order,
+        x = message_schema,
+        x = message_colnames
+      )
+    )
+  }
+}
+
 # Support both readr-style option names and Arrow C++ option names
-csv_file_format_parse_options <- function(...) {
+csv_file_format_parse_opts <- function(...) {
   opts <- list(...)
   # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
-  convert_opts <- names(formals(CsvConvertOptions$create))
+  convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options")
   read_opts <- c(
     names(formals(CsvReadOptions$create)),
-    names(formals(readr_to_csv_read_options))
+    names(formals(readr_to_csv_read_options)),
+    "read_options"
   )
   opts[convert_opts] <- NULL
   opts[read_opts] <- NULL
   opts[["schema"]] <- NULL
-  opt_names <- names(opts)
+  opts[["parse_options"]] <- NULL
+  opt_names <- get_opt_names(opts)
 
   arrow_opts <- c(names(formals(CsvParseOptions$create)))
   readr_opts <- c(names(formals(readr_to_csv_parse_options)))
 
   is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
   is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))
-
   # Catch options with ambiguous partial names (such as "del") that make it
   # unclear whether the user is specifying Arrow C++ options ("delimiter") or
   # readr-style options ("delim")
@@ -313,28 +387,38 @@ csv_file_format_parse_options <- function(...) {
 csv_file_format_convert_opts <- function(...) {
   opts <- list(...)
   # Filter out arguments meant for CsvParseOptions/CsvReadOptions
-  arrow_opts <- names(formals(CsvParseOptions$create))
+  arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
   readr_opts <- names(formals(readr_to_csv_parse_options))
   read_opts <- c(
     names(formals(CsvReadOptions$create)),
-    names(formals(readr_to_csv_read_options))
+    names(formals(readr_to_csv_read_options)),
+    "read_options"
   )
   opts[arrow_opts] <- NULL
   opts[readr_opts] <- NULL
   opts[read_opts] <- NULL
   opts[["schema"]] <- NULL
+  opts[["convert_options"]] <- NULL
+
+  # map "na" to "null_values"
+  if ("na" %in% names(opts)) {
+    opts[["null_values"]] <- opts[["na"]]
+    opts[["na"]] <- NULL
+  }
+
   do.call(CsvConvertOptions$create, opts)
 }
 
 csv_file_format_read_opts <- function(schema = NULL, ...) {
   opts <- list(...)
   # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
-  arrow_opts <- names(formals(CsvParseOptions$create))
+  arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
   readr_opts <- names(formals(readr_to_csv_parse_options))
-  convert_opts <- names(formals(CsvConvertOptions$create))
+  convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options")
   opts[arrow_opts] <- NULL
   opts[readr_opts] <- NULL
   opts[convert_opts] <- NULL
+  opts[["read_options"]] <- NULL
 
   opt_names <- names(opts)
   arrow_opts <- c(names(formals(CsvReadOptions$create)))
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 732c05ecb0..71247b3581 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -228,6 +228,130 @@ open_dataset <- function(sources,
   )
 }
 
+#' Open a multi-file dataset of CSV or other delimiter-separated format
+#'
+#' A wrapper around [open_dataset] which explicitly includes parameters mirroring [read_csv_arrow()],
+#' [read_delim_arrow()], and [read_tsv_arrow()] to allow for easy switching between functions
+#' for opening single files and functions for opening datasets.
+#'
+#' @inheritParams open_dataset
+#' @inheritParams read_delim_arrow
+#'
+#' @section Options currently supported by [read_delim_arrow()] which are not supported here:
+#' * `file` (instead, please specify files in `sources`)
+#' * `col_select` (instead, subset columns after dataset creation)
+#' * `quoted_na`
+#' * `as_data_frame` (instead, convert to data frame after dataset creation)
+#' * `parse_options`
+#'
+#' @examplesIf arrow_with_dataset()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' df <- data.frame(x = c("1", "2", "NULL"))
+#'
+#' file_path <- file.path(tf, "file1.txt")
+#' write.table(df, file_path, sep = ",", row.names = FALSE)
+#'
+#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+#'
+#' unlink(tf)
+#' @seealso [open_dataset()]
+#' @export
+open_delim_dataset <- function(sources,
+                               schema = NULL,
+                               partitioning = hive_partition(),
+                               hive_style = NA,
+                               unify_schemas = NULL,
+                               factory_options = list(),
+                               delim = ",",
+                               quote = "\"",
+                               escape_double = TRUE,
+                               escape_backslash = FALSE,
+                               col_names = TRUE,
+                               col_types = NULL,
+                               na = c("", "NA"),
+                               skip_empty_rows = TRUE,
+                               skip = 0L,
+                               convert_options = NULL,
+                               read_options = NULL,
+                               timestamp_parsers = NULL) {
+  open_dataset(
+    sources = sources,
+    schema = schema,
+    partitioning = partitioning,
+    hive_style = hive_style,
+    unify_schemas = unify_schemas,
+    factory_options = factory_options,
+    format = "text",
+    delim = delim,
+    quote = quote,
+    escape_double = escape_double,
+    escape_backslash = escape_backslash,
+    col_names = col_names,
+    col_types = col_types,
+    na = na,
+    skip_empty_rows = skip_empty_rows,
+    skip = skip,
+    convert_options = convert_options,
+    read_options = read_options,
+    timestamp_parsers = timestamp_parsers
+  )
+}
+
+#' @rdname open_delim_dataset
+#' @export
+open_csv_dataset <- function(sources,
+                             schema = NULL,
+                             partitioning = hive_partition(),
+                             hive_style = NA,
+                             unify_schemas = NULL,
+                             factory_options = list(),
+                             quote = "\"",
+                             escape_double = TRUE,
+                             escape_backslash = FALSE,
+                             col_names = TRUE,
+                             col_types = NULL,
+                             na = c("", "NA"),
+                             skip_empty_rows = TRUE,
+                             skip = 0L,
+                             convert_options = NULL,
+                             read_options = NULL,
+                             timestamp_parsers = NULL) {
+  mc <- match.call()
+  mc$delim <- ","
+  mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
+  eval.parent(mc)
+}
+
+#' @rdname open_delim_dataset
+#' @export
+open_tsv_dataset <- function(sources,
+                             schema = NULL,
+                             partitioning = hive_partition(),
+                             hive_style = NA,
+                             unify_schemas = NULL,
+                             factory_options = list(),
+                             quote = "\"",
+                             escape_double = TRUE,
+                             escape_backslash = FALSE,
+                             col_names = TRUE,
+                             col_types = NULL,
+                             na = c("", "NA"),
+                             skip_empty_rows = TRUE,
+                             skip = 0L,
+                             convert_options = NULL,
+                             read_options = NULL,
+                             timestamp_parsers = NULL) {
+  mc <- match.call()
+  mc$delim <- "\t"
+  mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
+  eval.parent(mc)
+}
+
+
+
 #' Multi-file datasets
 #'
 #' @description
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 3d5dc2d1f2..391d340769 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -141,6 +141,9 @@ reference:
   - title: Multi-file datasets
     contents:
       - open_dataset
+      - open_delim_dataset
+      - open_csv_dataset
+      - open_tsv_dataset
       - write_dataset
       - dataset_factory
       - hive_partition
@@ -149,6 +152,7 @@ reference:
       - Expression
       - Scanner
       - FileFormat
+      - CsvFileFormat
       - FileWriteOptions
       - FragmentScanOptions
       - map_batches
diff --git a/r/man/CsvFileFormat.Rd b/r/man/CsvFileFormat.Rd
new file mode 100644
index 0000000000..aa368b8f29
--- /dev/null
+++ b/r/man/CsvFileFormat.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-format.R
+\name{CsvFileFormat}
+\alias{CsvFileFormat}
+\title{CSV dataset file format}
+\value{
+A \code{CsvFileFormat} object
+}
+\description{
A \code{CsvFileFormat} is a \link{FileFormat} subclass which holds information about how to
+read and parse the files included in a CSV \code{Dataset}.
+}
+\section{Factory}{
+
\code{CsvFileFormat$create()} can take options in the form of lists passed through as \code{parse_options},
\code{read_options}, or \code{convert_options} parameters.  Alternatively, readr-style options can be passed
through individually.  While it is possible to pass in \code{CsvReadOptions}, \code{CsvConvertOptions}, and \code{CsvParseOptions}
+objects, this is not recommended as options set in these objects are not validated for compatibility.
+}
+
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+on.exit(unlink(tf))
+df <- data.frame(x = c("1", "2", "NULL"))
+write.table(df, file.path(tf, "file1.txt"), sep = ",", row.names = FALSE)
+
+# Create CsvFileFormat object with Arrow-style null_values option
+format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
+open_dataset(tf, format = format)
+
+# Use readr-style options
+format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
+open_dataset(tf, format = format)
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\link{FileFormat}
+}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index 3c6fd330b0..296de02ead 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -4,7 +4,6 @@
 \alias{FileFormat}
 \alias{ParquetFileFormat}
 \alias{IpcFileFormat}
-\alias{CsvFileFormat}
 \title{Dataset file formats}
 \description{
 A \code{FileFormat} holds information about how to read and parse the files
@@ -52,7 +51,7 @@ It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFile
 }
 
 \examples{
-\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 ## Semi-colon delimited files
 # Set up directory for examples
 tf <- tempfile()
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
index a54b210e18..b8aed28825 100644
--- a/r/man/acero.Rd
+++ b/r/man/acero.Rd
@@ -53,9 +53,9 @@ Table into an R \code{data.frame}.
 \item \code{\link[dplyr:slice]{slice_tail()}}: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating
 \item \code{\link[dplyr:summarise]{summarise()}}: window functions not currently supported; arguments \code{.drop = FALSE} and `.groups = "rowwise" not supported
 \item \code{\link[dplyr:count]{tally()}}
-\item \code{\link[dplyr:mutate]{transmute()}}
+\item \code{\link[dplyr:transmute]{transmute()}}
 \item \code{\link[dplyr:group_by]{ungroup()}}
-\item \code{\link[dplyr:reexports]{union()}}
+\item \code{\link[dplyr:setops]{union()}}
 \item \code{\link[dplyr:setops]{union_all()}}
 }
 }
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
new file mode 100644
index 0000000000..d127f772c6
--- /dev/null
+++ b/r/man/open_delim_dataset.Rd
@@ -0,0 +1,216 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset.R
+\name{open_delim_dataset}
+\alias{open_delim_dataset}
+\alias{open_csv_dataset}
+\alias{open_tsv_dataset}
+\title{Open a multi-file dataset of CSV or other delimiter-separated format}
+\usage{
+open_delim_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  delim = ",",
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+
+open_csv_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+
+open_tsv_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+}
+\arguments{
+\item{sources}{One of:
+\itemize{
+\item a string path or URI to a directory containing data files
+\item a \link{FileSystem} that references a directory containing data files
+(such as what is returned by \code{\link[=s3_bucket]{s3_bucket()}})
+\item a string path or URI to a single file
+\item a character vector of paths or URIs to individual data files
+\item a list of \code{Dataset} objects as created by this function
+\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}.
+}
+
+When \code{sources} is a vector of file URIs, they must all use the same protocol
+and point to files located in the same file system and having the same
+format.}
+
+\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema
+will be inferred from the data sources.}
+
+\item{partitioning}{When \code{sources} is a directory path/URI, one of:
+\itemize{
+\item a \code{Schema}, in which case the file paths relative to \code{sources} will be
+parsed, and path segments will be matched with the schema fields.
+\item a character vector that defines the field names corresponding to those
+path segments (that is, you're providing the names that would correspond
+to a \code{Schema} but the types will be autodetected)
+\item a \code{Partitioning} or \code{PartitioningFactory}, such as returned
+by \code{\link[=hive_partition]{hive_partition()}}
+\item \code{NULL} for no partitioning
+}
+
+The default is to autodetect Hive-style partitions unless
+\code{hive_style = FALSE}. See the "Partitioning" section for details.
+When \code{sources} is not a directory path/URI, \code{partitioning} is ignored.}
+
+\item{hive_style}{Logical: should \code{partitioning} be interpreted as
+Hive-style? Default is \code{NA}, which means to inspect the file paths for
+Hive-style partitioning and behave accordingly.}
+
+\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s)
+be scanned in order to create a unified schema from them? If \code{FALSE}, only
+the first fragment will be inspected for its schema. Use this fast path
+when you know and trust that all fragments have an identical schema.
+The default is \code{FALSE} when creating a dataset from a directory path/URI or
+vector of file paths/URIs (because there may be many files and scanning may
+be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there
+should be few \code{Dataset}s in the list and their \code{Schema}s are already in
+memory).}
+
+\item{factory_options}{list of optional FileSystemFactoryOptions:
+\itemize{
+\item \code{partition_base_dir}: string path segment prefix to ignore when
+discovering partition information with DirectoryPartitioning. Not
+meaningful (ignored with a warning) for HivePartitioning, nor is it
+valid when providing a vector of file paths.
+\item \code{exclude_invalid_files}: logical: should files that are not valid data
+files be excluded? Default is \code{FALSE} because checking all files up
+front incurs I/O and thus will be slower, especially on remote
+filesystems. If false and there are invalid files, there will be an
+error at scan time. This is the only FileSystemFactoryOption that is
+valid for both when providing a directory path in which to discover
+files and when providing a vector of file paths.
+\item \code{selector_ignore_prefixes}: character vector of file prefixes to ignore
+when discovering files in a directory. If invalid files can be excluded
+by a common filename prefix this way, you can avoid the I/O cost of
+\code{exclude_invalid_files}. Not valid when providing a vector of file paths
+(but if you're providing the file list, you can filter invalid files
+yourself).
+}}
+
+\item{delim}{Single character used to separate fields within a record.}
+
+\item{quote}{Single character used to quote strings.}
+
+\item{escape_double}{Does the file escape quotes by doubling them?
+i.e. If this option is \code{TRUE}, the value \verb{""""} represents
+a single quote, \verb{\\"}.}
+
+\item{escape_backslash}{Does the file use backslashes to escape special
+characters? This is more general than \code{escape_double} as backslashes
+can be used to escape the delimiter character, the quote character, or
+to add special characters like \verb{\\\\n}.}
+
+\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
+column names and will not be included in the data frame. If \code{FALSE}, column
+names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+Alternatively, you can specify a character vector of column names.}
+
+\item{col_types}{A compact string representation of the column types,
+an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the data.}
+
+\item{na}{A character vector of strings to interpret as missing values.}
+
+\item{skip_empty_rows}{Should blank rows be ignored altogether? If
+\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
+filled with missings.}
+
+\item{skip}{Number of lines to skip before reading data.}
+
+\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are:
+\itemize{
+\item \code{NULL}: the default, which uses the ISO-8601 parser
+\item a character vector of \link[base:strptime]{strptime} parse strings
+\item a list of \link{TimestampParser} objects
+}}
+}
+\description{
+A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}},
+\code{\link[=read_delim_arrow]{read_delim_arrow()}}, and \code{\link[=read_tsv_arrow]{read_tsv_arrow()}} to allow for easy switching between functions
+for opening single files and functions for opening datasets.
+}
+\section{Options currently supported by \code{\link[=read_delim_arrow]{read_delim_arrow()}} which are not supported here}{
+
+\itemize{
+\item \code{file} (instead, please specify files in \code{sources})
+\item \code{col_select} (instead, subset columns after dataset creation)
+\item \code{quoted_na}
+\item \code{as_data_frame} (instead, convert to data frame after dataset creation)
+\item \code{parse_options}
+}
+}
+
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+df <- data.frame(x = c("1", "2", "NULL"))
+
+file_path <- file.path(tf, "file1.txt")
+write.table(df, file_path, sep = ",", row.names = FALSE)
+
+read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+
+unlink(tf)
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\code{\link[=open_dataset]{open_dataset()}}
+}
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 436db985fb..b25c57b2ba 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -218,9 +218,9 @@ test_that("readr parse options", {
     character(0)
   )
 
-  # With not yet supported readr parse options (ARROW-8631)
+  # With not yet supported readr parse options
   expect_error(
-    open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"),
+    open_dataset(tsv_dir, partitioning = "part", delim = "\t", quoted_na = TRUE),
     "supported"
   )
 
@@ -476,3 +476,89 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
 
   expect_equal(ds1, ds2)
 })
+
+test_that("open_delim_dataset params passed through to open_dataset", {
+  ds <- open_delim_dataset(csv_dir, delim = ",", partitioning = "part")
+  expect_r6_class(ds$format, "CsvFileFormat")
+  expect_r6_class(ds$filesystem, "LocalFileSystem")
+  expect_identical(names(ds), c(names(df1), "part"))
+  expect_identical(dim(ds), c(20L, 7L))
+
+  # quote
+  dst_dir <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "data.csv")
+
+  df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'"))
+  write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+  ds_quote <- open_csv_dataset(dst_dir, quote = "'") %>% collect()
+  expect_equal(ds_quote$b, c("abc", "def"))
+
+  # na
+  ds <- open_csv_dataset(csv_dir, partitioning = "part", na = c("", "NA", "FALSE")) %>% collect()
+  expect_identical(ds$lgl, c(
+    TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA,
+    TRUE, NA, TRUE, NA, NA, TRUE, NA
+  ))
+
+  # col_names and skip
+  ds <- open_csv_dataset(
+    csv_dir,
+    partitioning = "part",
+    col_names = paste0("col_", 1:6),
+    skip = 1
+  ) %>% collect()
+
+  expect_named(ds, c("col_1", "col_2", "col_3", "col_4", "col_5", "col_6", "part"))
+  expect_equal(nrow(ds), 20)
+
+  # col_types
+  dst_dir <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "data.csv")
+
+  df <- data.frame(a = c(1, NA, 2), b = c("'abc'", NA, "'def'"))
+  write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+  data_schema <- schema(a = string(), b = string())
+  ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema)
+  expect_equal(ds_strings$schema, schema(a = string(), b = string()))
+
+  # skip_empty_rows
+  tf <- tempfile()
+  writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf)
+
+  ds <- open_csv_dataset(tf, skip_empty_rows = FALSE) %>% collect()
+  expect_equal(nrow(ds), 7)
+
+  # convert_options
+  ds <- open_csv_dataset(
+    csv_dir,
+    convert_options = list(null_values = c("NA", "", "FALSE"), strings_can_be_null = TRUE)
+  ) %>% collect()
+
+  expect_equal(
+    ds$lgl,
+    c(TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA)
+  )
+
+  # read_options
+  ds <- open_csv_dataset(
+    csv_dir,
+    read_options = list(column_names = paste0("col_", 1:6))
+  ) %>% collect()
+
+  expect_named(ds, c("col_1", "col_2", "col_3", "col_4", "col_5", "col_6"))
+
+  # timestamp_parsers
+  skip("GH-33708: timestamp_parsers don't appear to be working properly")
+
+  dst_dir <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "data.csv")
+
+  df <- data.frame(time = "2023-01-16 19:47:57")
+  write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+  ds <- open_csv_dataset(dst_dir, timestamp_parsers = c(TimestampParser$create(format = "%d-%m-%y"))) %>% collect()
+
+  expect_equal(ds$time, "16-01-2023")
+})