You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/27 22:24:54 UTC
[arrow] branch master updated: ARROW-5500: [R] read_csv_arrow()
signature should match readr::read_csv()
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d8b3be9 ARROW-5500: [R] read_csv_arrow() signature should match readr::read_csv()
d8b3be9 is described below
commit d8b3be9069f6172e41d5f1dda05ab37810d900ce
Author: Neal Richardson <ne...@gmail.com>
AuthorDate: Thu Jun 27 17:24:41 2019 -0500
ARROW-5500: [R] read_csv_arrow() signature should match readr::read_csv()
This patch enumerates the various CSV parsing options and exposes them in an R-familiar way in the signature of `read_csv_arrow()`. It also adds a generic `read_delim_arrow()` for providing other delimiting characters, as well as a `read_tsv_arrow()`. In the process, I identified some limitations of the current reader (https://issues.apache.org/jira/browse/ARROW-5747) and of the R bindings to it (not yet ticketed), and added more docs and tests.
Other release-prep cleanup in here includes organization of the DESCRIPTION file, adding new functions to the pkgdown config, and adding a NEWS.md.
Author: Neal Richardson <ne...@gmail.com>
Closes #4711 from nealrichardson/readr-csv and squashes the following commits:
92b0a2788 <Neal Richardson> :rat:
22268d960 <Neal Richardson> Rename man topic in pkgdown.yml
fc156e3e8 <Neal Richardson> Doc :nailcare:, add read_delim_arrow and read_tsv_arrow
fb75af1fa <Neal Richardson> More docs and tests for csv parse options; skip a few that aren't supported
8e2fa2d9a <Neal Richardson> Some cleanup of pkgdown site prep and DESCRIPTION. Start on implementing readr::read_csv arguments
---
r/DESCRIPTION | 28 ++---
r/NAMESPACE | 2 +
r/NEWS.md | 26 ++++
r/R/csv.R | 259 +++++++++++++++++++++++++++++++-------
r/_pkgdown.yml | 15 ++-
r/man/arrow-package.Rd | 1 -
r/man/csv_parse_options.Rd | 4 +-
r/man/csv_table_reader.Rd | 12 +-
r/man/read_csv_arrow.Rd | 27 ----
r/man/read_delim_arrow.Rd | 79 ++++++++++++
r/tests/testthat/test-arrow-csv.R | 87 +++++++++++--
11 files changed, 442 insertions(+), 98 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 45edda1..47eccc8 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -3,10 +3,10 @@ Title: Integration to 'Apache' 'Arrow'
Version: 0.13.0.9000
Authors@R: c(
person("Romain", "Fran\u00e7ois", email = "romain@rstudio.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2444-4226")),
- person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")),
- person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")),
person("Jeroen", "Ooms", email = "jeroen@berkeley.edu", role = c("aut")),
person("Neal", "Richardson", email = "neal@ursalabs.org", role = c("aut")),
+ person("Javier", "Luraschi", email = "javier@rstudio.com", role = c("ctb")),
+ person("Jeffrey", "Wong", email = "jeffreyw@netflix.com", role = c("ctb")),
person("Apache Arrow", email = "dev@arrow.apache.org", role = c("aut", "cph"))
)
Description: 'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
@@ -16,7 +16,7 @@ Description: 'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
package provides an interface to the Arrow C++ library.
Depends: R (>= 3.1)
License: Apache License (>= 2.0)
-URL: https://arrow.apache.org/docs/r/, https://github.com/apache/arrow/
+URL: https://github.com/apache/arrow/
BugReports: https://issues.apache.org/jira/projects/ARROW/issues
Encoding: UTF-8
LazyData: true
@@ -24,27 +24,27 @@ SystemRequirements: C++11
LinkingTo:
Rcpp (>= 1.0.1)
Imports:
- utils,
- Rcpp (>= 1.0.1),
- rlang,
- purrr,
assertthat,
- R6,
- fs,
bit64,
- tidyselect
+ fs,
+ purrr,
+ R6,
+ Rcpp (>= 1.0.1),
+ rlang,
+ tidyselect,
+ utils
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
Suggests:
- tibble,
covr,
+ hms,
+ lubridate,
pkgdown,
rmarkdown,
roxygen2,
testthat,
- lubridate,
- vctrs,
- hms
+ tibble,
+ vctrs
Collate:
'enums.R'
'R6.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index e82b30a..e4b367d 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -162,6 +162,7 @@ export(parquet_arrow_reader_properties)
export(parquet_file_reader)
export(read_arrow)
export(read_csv_arrow)
+export(read_delim_arrow)
export(read_feather)
export(read_json_arrow)
export(read_message)
@@ -169,6 +170,7 @@ export(read_parquet)
export(read_record_batch)
export(read_schema)
export(read_table)
+export(read_tsv_arrow)
export(record_batch)
export(schema)
export(starts_with)
diff --git a/r/NEWS.md b/r/NEWS.md
new file mode 100644
index 0000000..fa6b25a
--- /dev/null
+++ b/r/NEWS.md
@@ -0,0 +1,26 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# arrow 0.13.0.9000
+
+Initial CRAN release of the `arrow` package. Key features include:
+
+* Read and write support for various file formats, including Parquet, Feather/Arrow, CSV, and JSON.
+* API bindings to the C++ library for Arrow data types and objects, as well as mapping between Arrow types and R data types.
+* Tools for helping with C++ library configuration and installation.
diff --git a/r/R/csv.R b/r/R/csv.R
index 03a4b7d..8f4370a 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -15,6 +15,168 @@
# specific language governing permissions and limitations
# under the License.
+#' Read a CSV or other delimited file with Arrow
+#'
+#' These functions use the Arrow C++ CSV reader to read into a `data.frame`.
+#' Arrow C++ options have been mapped to argument names that follow those of
+#' [readr::read_delim()], and `col_select` was inspired by [vroom::vroom()].
+#'
+#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
+#' `read_delim_arrow()` that specify a delimiter.
+#'
+#' Note that not all `readr` options are currently implemented here. Please file
+#' an issue if you encounter one that `arrow` should support.
+#'
+#' If you need to control Arrow-specific reader parameters that don't have an
+#' equivalent in `readr::read_csv()`, you can either provide them in the
+#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
+#' call [csv_table_reader()] directly for lower-level access.
+#'
+#' @param file A character path to a local file, or an Arrow input stream
+#' @param delim Single character used to separate fields within a record.
+#' @param quote Single character used to quote strings.
+#' @param escape_double Does the file escape quotes by doubling them?
+#' i.e. If this option is `TRUE`, the value `""""` represents
+#' a single quote, `\"`.
+#' @param escape_backslash Does the file use backslashes to escape special
+#' characters? This is more general than `escape_double` as backslashes
+#' can be used to escape the delimiter character, the quote character, or
+#' to add special characters like `\\n`.
+# #' @param col_names If `TRUE`, the first row of the input will be used as the
+# #' column names and will not be included in the data frame. Note that `FALSE`
+# #' is not currently supported, nor is specifying a character vector of column
+# #' names.
+#' @param col_select A [tidy selection specification][tidyselect::vars_select]
+#' of columns, as used in `dplyr::select()`.
+#' @param skip_empty_rows Should blank rows be ignored altogether? If
+#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
+#' filled with missings.
+# #' @param skip Number of lines to skip before reading data.
+#' @param parse_options see [csv_parse_options()]. If given, this overrides any
+#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
+#' @param convert_options see [csv_convert_options()]
+#' @param read_options see [csv_read_options()]
+#' @param as_tibble Should the function return a `data.frame` (`TRUE`, the
+#' default) or an [arrow::Table][arrow__Table] (`FALSE`)?
+#'
+#' @return A `data.frame`, or an `arrow::Table` if `as_tibble = FALSE`.
+#' @export
+read_delim_arrow <- function(file,
+ delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ # col_names = TRUE,
+ # col_types = TRUE,
+ col_select = NULL,
+ # na = c("", "NA"),
+ # quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ # skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = csv_read_options(),
+ as_tibble = TRUE) {
+
+ # These are hardcoded pending https://issues.apache.org/jira/browse/ARROW-5747
+ col_names <- TRUE
+ skip <- 0L
+
+ if (is.null(parse_options)) {
+ if (isTRUE(col_names)) {
+ # Add one row to skip, to match arrow's header_rows
+ skip <- skip + 1L
+ # Note that with the hardcoding, header_rows is always 1, which
+ # turns out to be the only value that works meaningfully
+ }
+ parse_options <- readr_to_csv_parse_options(
+ delim,
+ quote,
+ escape_double,
+ escape_backslash,
+ skip_empty_rows,
+ skip
+ )
+ }
+
+ if (is.null(convert_options)) {
+ # TODO:
+ # * na strings (needs wiring in csv_convert_options)
+ # * col_types (needs wiring in csv_convert_options). Note that we can't do
+ # col_types if col_names is strings because the column type specification
+ # requires a map of name: type, but the CSV reader doesn't handle user-
+ # provided names--they're renamed after the fact.
+ convert_options <- csv_convert_options()
+ }
+
+ reader <- csv_table_reader(
+ file,
+ read_options = read_options,
+ parse_options = parse_options,
+ convert_options = convert_options
+ )
+
+ tab <- reader$Read()$select(!!enquo(col_select))
+ if (is.character(col_names)) {
+ # TODO: Rename `tab`'s columns
+ # See https://github.com/apache/arrow/pull/4557
+ }
+
+ if (isTRUE(as_tibble)) {
+ tab <- as.data.frame(tab)
+ }
+
+ tab
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_csv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ # col_names = TRUE,
+ # col_types = TRUE,
+ col_select = NULL,
+ # na = c("", "NA"),
+ # quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ # skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = csv_read_options(),
+ as_tibble = TRUE) {
+
+ mc <- match.call()
+ mc$delim <- ","
+ mc[[1]] <- as.name("read_delim_arrow")
+ eval.parent(mc)
+}
+
+#' @rdname read_delim_arrow
+#' @export
+read_tsv_arrow <- function(file,
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ # col_names = TRUE,
+ # col_types = TRUE,
+ col_select = NULL,
+ # na = c("", "NA"),
+ # quoted_na = TRUE,
+ skip_empty_rows = TRUE,
+ # skip = 0L,
+ parse_options = NULL,
+ convert_options = NULL,
+ read_options = csv_read_options(),
+ as_tibble = TRUE) {
+
+ mc <- match.call()
+ mc$delim <- "\t"
+ mc[[1]] <- as.name("read_delim_arrow")
+ eval.parent(mc)
+}
+
#' @include R6.R
`arrow::csv::TableReader` <- R6Class("arrow::csv::TableReader", inherit = `arrow::Object`,
@@ -41,7 +203,29 @@ csv_read_options <- function(block_size = 1048576L) {
))
}
-#' Parsing options
+readr_to_csv_parse_options <- function(delim = ",",
+ quote = '"',
+ escape_double = TRUE,
+ escape_backslash = FALSE,
+ skip_empty_rows = TRUE,
+ skip = 0L) {
+ # This function translates from the readr argument list to the arrow arg names
+ # TODO: validate inputs
+ csv_parse_options(
+ delimiter = delim,
+ quoting = nzchar(quote),
+ quote_char = quote,
+ double_quote = escape_double,
+ escaping = escape_backslash,
+ escape_char = '\\',
+ newlines_in_values = escape_backslash,
+ ignore_empty_lines = skip_empty_rows,
+ header_rows = skip
+ )
+}
+
+#' CSV parsing options
+#'
#'
#' @param delimiter Field delimiter
#' @param quoting Whether quoting is used
@@ -54,12 +238,16 @@ csv_read_options <- function(block_size = 1048576L) {
#' @param header_rows Number of header rows to skip (including the first row containing column names)
#'
#' @export
-csv_parse_options <- function(
- delimiter = ",", quoting = TRUE, quote_char = '"',
- double_quote = TRUE, escaping = FALSE, escape_char = '\\',
- newlines_in_values = FALSE, ignore_empty_lines = TRUE,
- header_rows = 1L
-){
+csv_parse_options <- function(delimiter = ",",
+ quoting = TRUE,
+ quote_char = '"',
+ double_quote = TRUE,
+ escaping = FALSE,
+ escape_char = '\\',
+ newlines_in_values = FALSE,
+ ignore_empty_lines = TRUE,
+ header_rows = 1L) {
+
shared_ptr(`arrow::csv::ParseOptions`, csv___ParseOptions__initialize(
list(
delimiter = delimiter,
@@ -80,7 +268,20 @@ csv_parse_options <- function(
#' @param check_utf8 Whether to check UTF8 validity of string columns
#'
#' @export
-csv_convert_options <- function(check_utf8 = TRUE){
+csv_convert_options <- function(check_utf8 = TRUE) {
+ # TODO: there are more conversion options available:
+ # // Optional per-column types (disabling type inference on those columns)
+ # std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+ # // Recognized spellings for null values
+ # std::vector<std::string> null_values;
+ # // Recognized spellings for boolean values
+ # std::vector<std::string> true_values;
+ # std::vector<std::string> false_values;
+ # // Whether string / binary columns can have null values.
+ # // If true, then strings in "null_values" are considered null for string columns.
+ # // If false, then all strings are valid string values.
+ # bool strings_can_be_null = false;
+
shared_ptr(`arrow::csv::ConvertOptions`, csv___ConvertOptions__initialize(
list(
check_utf8 = check_utf8
@@ -88,14 +289,20 @@ csv_convert_options <- function(check_utf8 = TRUE){
))
}
-#' CSV table reader
+#' Arrow CSV table reader
+#'
+#' These methods wrap the Arrow C++ CSV table reader.
+#' For an interface to the CSV reader that's more familiar for R users, see
+#' [read_csv_arrow()]
#'
-#' @param file file
+#' @param file A character path to a local file, or an Arrow input stream
#' @param read_options, see [csv_read_options()]
#' @param parse_options, see [csv_parse_options()]
#' @param convert_options, see [csv_convert_options()]
#' @param ... additional parameters.
#'
+#' @return An `arrow::csv::TableReader` R6 object. Call `$Read()` on it to get
+#' an Arrow Table.
#' @export
csv_table_reader <- function(file,
read_options = csv_read_options(),
@@ -167,35 +374,3 @@ csv_table_reader.default <- function(file,
){
file
}
-
-#' Read csv file into an arrow::Table
-#'
-#' Use arrow::csv::TableReader from [csv_table_reader()]
-#'
-#' @inheritParams csv_table_reader
-#'
-#' @param col_select [tidy selection specification][tidyselect::vars_select] of columns
-#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a data frame.
-#'
-#' @export
-read_csv_arrow <- function(file,
- read_options = csv_read_options(),
- parse_options = csv_parse_options(),
- convert_options = csv_convert_options(),
- col_select = NULL,
- as_tibble = TRUE
- )
-{
- reader <- csv_table_reader(file,
- read_options = read_options,
- parse_options = parse_options,
- convert_options = convert_options)
-
- tab <- reader$Read()$select(!!enquo(col_select))
-
- if (isTRUE(as_tibble)) {
- tab <- as.data.frame(tab)
- }
-
- tab
-}
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 69c02e0..648085b 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -39,17 +39,28 @@ navbar:
text: Reference
href: reference/index.html
reference:
+- title: Installation helpers
+ contents:
+ - arrow_available
+ - install_arrow
- title: Reading and writing files
contents:
- - read_csv_arrow
+ - read_delim_arrow
+ - read_json_arrow
- read_feather
- read_parquet
- write_arrow
- write_feather
+ - write_parquet
- csv_convert_options
- csv_parse_options
- csv_read_options
- csv_table_reader
+ - json_parse_options
+ - json_read_options
+ - parquet_arrow_reader_properties
+ - json_table_reader
+ - parquet_file_reader
- title: Arrow data containers
contents:
- buffer
@@ -92,6 +103,7 @@ reference:
- arrow__io__RandomAccessFile
- arrow__io__Readable
- arrow__io__ReadableFile
+ - arrow__json__TableReader
- arrow__ipc__Message
- arrow__ipc__MessageReader
- arrow__ipc__RecordBatchFileReader
@@ -117,7 +129,6 @@ reference:
- RecordBatchFileWriter
- RecordBatchStreamReader
- RecordBatchStreamWriter
- - threadpool
- cast_options
- compression_codec
- default_memory_pool
diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd
index 1f4b5fb..c3da92d 100644
--- a/r/man/arrow-package.Rd
+++ b/r/man/arrow-package.Rd
@@ -15,7 +15,6 @@
\seealso{
Useful links:
\itemize{
- \item \url{https://arrow.apache.org/docs/r/}
\item \url{https://github.com/apache/arrow/}
\item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues}
}
diff --git a/r/man/csv_parse_options.Rd b/r/man/csv_parse_options.Rd
index 7e6ab77..ac98262 100644
--- a/r/man/csv_parse_options.Rd
+++ b/r/man/csv_parse_options.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/csv.R
\name{csv_parse_options}
\alias{csv_parse_options}
-\title{Parsing options}
+\title{CSV parsing options}
\usage{
csv_parse_options(delimiter = ",", quoting = TRUE,
quote_char = "\\"", double_quote = TRUE, escaping = FALSE,
@@ -29,5 +29,5 @@ csv_parse_options(delimiter = ",", quoting = TRUE,
\item{header_rows}{Number of header rows to skip (including the first row containing column names)}
}
\description{
-Parsing options
+CSV parsing options
}
diff --git a/r/man/csv_table_reader.Rd b/r/man/csv_table_reader.Rd
index 029cd0b..862aefb 100644
--- a/r/man/csv_table_reader.Rd
+++ b/r/man/csv_table_reader.Rd
@@ -2,14 +2,14 @@
% Please edit documentation in R/csv.R
\name{csv_table_reader}
\alias{csv_table_reader}
-\title{CSV table reader}
+\title{Arrow CSV table reader}
\usage{
csv_table_reader(file, read_options = csv_read_options(),
parse_options = csv_parse_options(),
convert_options = csv_convert_options(), ...)
}
\arguments{
-\item{file}{file}
+\item{file}{A character path to a local file, or an Arrow input stream}
\item{read_options, }{see \code{\link[=csv_read_options]{csv_read_options()}}}
@@ -19,6 +19,12 @@ csv_table_reader(file, read_options = csv_read_options(),
\item{...}{additional parameters.}
}
+\value{
+An \code{arrow::csv::TableReader} R6 object. Call \code{$Read()} on it to get
+an Arrow Table.
+}
\description{
-CSV table reader
+These methods wrap the Arrow C++ CSV table reader.
+For an interface to the CSV reader that's more familiar for R users, see
+\code{\link[=read_csv_arrow]{read_csv_arrow()}}
}
diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_csv_arrow.Rd
deleted file mode 100644
index 47e5158..0000000
--- a/r/man/read_csv_arrow.Rd
+++ /dev/null
@@ -1,27 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/csv.R
-\name{read_csv_arrow}
-\alias{read_csv_arrow}
-\title{Read csv file into an arrow::Table}
-\usage{
-read_csv_arrow(file, read_options = csv_read_options(),
- parse_options = csv_parse_options(),
- convert_options = csv_convert_options(), col_select = NULL,
- as_tibble = TRUE)
-}
-\arguments{
-\item{file}{file}
-
-\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}}
-
-\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}}
-
-\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}}
-
-\item{col_select}{\link[tidyselect:vars_select]{tidy selection specification} of columns}
-
-\item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a data frame.}
-}
-\description{
-Use arrow::csv::TableReader from \code{\link[=csv_table_reader]{csv_table_reader()}}
-}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
new file mode 100644
index 0000000..e1ca16f
--- /dev/null
+++ b/r/man/read_delim_arrow.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/csv.R
+\name{read_delim_arrow}
+\alias{read_delim_arrow}
+\alias{read_csv_arrow}
+\alias{read_tsv_arrow}
+\title{Read a CSV or other delimited file with Arrow}
+\usage{
+read_delim_arrow(file, delim = ",", quote = "\\"",
+ escape_double = TRUE, escape_backslash = FALSE, col_select = NULL,
+ skip_empty_rows = TRUE, parse_options = NULL,
+ convert_options = NULL, read_options = csv_read_options(),
+ as_tibble = TRUE)
+
+read_csv_arrow(file, quote = "\\"", escape_double = TRUE,
+ escape_backslash = FALSE, col_select = NULL,
+ skip_empty_rows = TRUE, parse_options = NULL,
+ convert_options = NULL, read_options = csv_read_options(),
+ as_tibble = TRUE)
+
+read_tsv_arrow(file, quote = "\\"", escape_double = TRUE,
+ escape_backslash = FALSE, col_select = NULL,
+ skip_empty_rows = TRUE, parse_options = NULL,
+ convert_options = NULL, read_options = csv_read_options(),
+ as_tibble = TRUE)
+}
+\arguments{
+\item{file}{A character path to a local file, or an Arrow input stream}
+
+\item{delim}{Single character used to separate fields within a record.}
+
+\item{quote}{Single character used to quote strings.}
+
+\item{escape_double}{Does the file escape quotes by doubling them?
+i.e. If this option is \code{TRUE}, the value \code{""""} represents
+a single quote, \code{\"}.}
+
+\item{escape_backslash}{Does the file use backslashes to escape special
+characters? This is more general than \code{escape_double} as backslashes
+can be used to escape the delimiter character, the quote character, or
+to add special characters like \code{\\n}.}
+
+\item{col_select}{A \link[tidyselect:vars_select]{tidy selection specification}
+of columns, as used in \code{dplyr::select()}.}
+
+\item{skip_empty_rows}{Should blank rows be ignored altogether? If
+\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
+filled with missings.}
+
+\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}. If given, this overrides any
+parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, etc.).}
+
+\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}}
+
+\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}}
+
+\item{as_tibble}{Should the function return a \code{data.frame} (\code{TRUE}, the
+default) or an \link[=arrow__Table]{arrow::Table} (\code{FALSE})?}
+}
+\value{
+A \code{data.frame}, or an \code{arrow::Table} if \code{as_tibble = FALSE}.
+}
+\description{
+These functions use the Arrow C++ CSV reader to read into a \code{data.frame}.
+Arrow C++ options have been mapped to argument names that follow those of
+\code{\link[readr:read_delim]{readr::read_delim()}}, and \code{col_select} was inspired by \code{\link[vroom:vroom]{vroom::vroom()}}.
+}
+\details{
+\code{read_csv_arrow()} and \code{read_tsv_arrow()} are wrappers around
+\code{read_delim_arrow()} that specify a delimiter.
+
+Note that not all \code{readr} options are currently implemented here. Please file
+an issue if you encounter one that \code{arrow} should support.
+
+If you need to control Arrow-specific reader parameters that don't have an
+equivalent in \code{readr::read_csv()}, you can either provide them in the
+\code{parse_options}, \code{convert_options}, or \code{read_options} arguments, or you can
+call \code{\link[=csv_table_reader]{csv_table_reader()}} directly for lower-level access.
+}
diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R
index 7f0c1ae..aed9638 100644
--- a/r/tests/testthat/test-arrow-csv.R
+++ b/r/tests/testthat/test-arrow-csv.R
@@ -19,8 +19,9 @@ context("arrow::csv::TableReader")
test_that("Can read csv file", {
tf <- tempfile()
+ on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+ write.csv(iris, tf, row.names = FALSE)
tab1 <- read_csv_arrow(tf, as_tibble = FALSE)
tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = FALSE)
@@ -31,14 +32,13 @@ test_that("Can read csv file", {
expect_equal(tab0, tab1)
expect_equal(tab0, tab2)
expect_equal(tab0, tab3)
-
- unlink(tf)
})
test_that("read_csv_arrow(as_tibble=TRUE)", {
tf <- tempfile()
+ on.exit(unlink(tf))
- write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+ write.csv(iris, tf, row.names = FALSE)
tab1 <- read_csv_arrow(tf, as_tibble = TRUE)
tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = TRUE)
@@ -48,12 +48,87 @@ test_that("read_csv_arrow(as_tibble=TRUE)", {
expect_equivalent(iris, tab1)
expect_equivalent(iris, tab2)
expect_equivalent(iris, tab3)
+})
+
+test_that("read_delim_arrow parsing options: delim", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.table(iris, tf, sep = "\t", row.names = FALSE)
+ tab1 <- read_tsv_arrow(tf)
+ tab2 <- read_delim_arrow(tf, delim = "\t")
+ expect_equivalent(tab1, tab2)
+
+ iris$Species <- as.character(iris$Species)
+ expect_equivalent(iris, tab1)
+})
- unlink(tf)
+test_that("read_delim_arrow parsing options: quote", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ df <- data.frame(a=c(1, 2), b=c("'abc'", "'def'"))
+ write.table(df, sep=";", tf, row.names = FALSE, quote = FALSE)
+ tab1 <- read_delim_arrow(tf, delim = ";", quote = "'")
+
+ # Is this a problem?
+ # Component “a”: target is integer64, current is numeric
+ tab1$a <- as.numeric(tab1$a)
+ expect_equivalent(
+ tab1,
+ data.frame(a=c(1, 2), b=c("abc", "def"), stringsAsFactors = FALSE)
+ )
})
+test_that("read_csv_arrow parsing options: col_names", {
+ skip("Invalid: Empty CSV file")
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.table(iris, tf, sep = ",", row.names = FALSE, col.names = FALSE)
+ tab1 <- read_csv_arrow(tf, col_names = FALSE)
+
+ expect_identical(names(tab1), names(iris))
+ iris$Species <- as.character(iris$Species)
+ expect_equivalent(iris, tab1)
+})
+
+test_that("read_csv_arrow parsing options: skip", {
+ skip("Invalid: Empty CSV file")
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ cat("asdf\nqwer\n", file = tf)
+ suppressWarnings(write.table(iris, tf, sep = ",", row.names = FALSE, append = TRUE))
+ # This works:
+ # print(head(readr::read_csv(tf, skip = 2)))
+
+ # This errors:
+ tab1 <- read_csv_arrow(tf, skip = 2)
+
+ expect_identical(names(tab1), names(iris))
+ iris$Species <- as.character(iris$Species)
+ expect_equivalent(iris, tab1)
+})
+
+test_that("read_csv_arrow parsing options: skip_empty_rows", {
+ skip("Invalid: Empty CSV file")
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ write.csv(iris, tf, row.names = FALSE)
+ cat("\n\n", file = tf, append = TRUE)
+
+ tab1 <- read_csv_arrow(tf, skip_empty_rows = FALSE)
+
+ expect_equal(nrow(tab1), nrow(iris) + 2)
+ expect_true(is.na(tail(iris, 1)[[1]]))
+})
+
+
test_that("read_csv_arrow() respects col_select", {
tf <- tempfile()
+ on.exit(unlink(tf))
write.csv(iris, tf, row.names = FALSE, quote = FALSE)
@@ -62,6 +137,4 @@ test_that("read_csv_arrow() respects col_select", {
tib <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_tibble = TRUE)
expect_equal(tib, tibble::tibble(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
-
- unlink(tf)
})