You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/06/25 18:20:32 UTC
[arrow] branch master updated: ARROW-5492: [R] Add "col_select"
argument to read_* functions to read subset of columns
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2b5fbed ARROW-5492: [R] Add "col_select" argument to read_* functions to read subset of columns
2b5fbed is described below
commit 2b5fbed581fc97f928697adcd2d09ed9a80a631a
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Tue Jun 25 20:20:15 2019 +0200
ARROW-5492: [R] Add "col_select" argument to read_* functions to read subset of columns
``` r
library(arrow, warn.conflicts = FALSE)
library(tibble)
tf <- tempfile()
write.csv(iris, tf, row.names = FALSE, quote = FALSE)
read_csv_arrow(tf, col_select = starts_with("Sepal"))
#> # A tibble: 150 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 5.1 3.5
#> 2 4.9 3
#> 3 4.7 3.2
#> 4 4.6 3.1
#> 5 5 3.6
#> 6 5.4 3.9
#> 7 4.6 3.4
#> 8 5 3.4
#> 9 4.4 2.9
#> 10 4.9 3.1
#> # … with 140 more rows
tf <- tempfile()
write_feather(iris, tf)
read_feather(tf, col_select = contains("."))
#> # A tibble: 150 x 4
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5.1 3.5 1.4 0.2
#> 2 4.9 3 1.4 0.2
#> 3 4.7 3.2 1.3 0.2
#> 4 4.6 3.1 1.5 0.2
#> 5 5 3.6 1.4 0.2
#> 6 5.4 3.9 1.7 0.4
#> 7 4.6 3.4 1.4 0.3
#> 8 5 3.4 1.5 0.2
#> 9 4.4 2.9 1.4 0.2
#> 10 4.9 3.1 1.5 0.1
#> # … with 140 more rows
```
<sup>Created on 2019-06-20 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000)</sup>
Author: Romain Francois <ro...@rstudio.com>
Author: Romain François <ro...@rstudio.com>
Closes #4627 from romainfrancois/ARROW-5492/tidyselect and squashes the following commits:
fe5906ceb <Romain François> Merge branch 'master' into ARROW-5492/tidyselect
89bcfdd11 <Romain Francois> always mmap
48fc793cc <Romain Francois> remmove ARROW_R_WITH_PARQUET
e7403c533 <Romain Francois> use remotes::install_github() instead of pak::pkg_install()
b73be3651 <Romain Francois> merge hickup
1e40a0da0 <Romain Francois> read_json_arrow(col_select=)
e112ff8c5 <Romain Francois> support for col_select=<tidy selection> in read_parquet()
21561b17b <Romain Francois> FileReader$GetSchema()
f6d824cf7 <Romain Francois> s/ParquetFileReader/FileReader/
f5bb537b0 <Romain Francois> expose parquet::arrow::ArrowReaderProperties
512579e06 <Romain Francois> expose ParquetFileReader at the R side
c44b9cb98 <Romain Francois> more read_csv_arrow(col_select=) tests
4d5515f8a <Romain Francois> read_csv_arrow(col_select=, as_tibble=)
07bcddaf6 <Romain Francois> Table$select(<tidy selection>)
826cd3aab <Romain Francois> tests for read_feather(col_select=<tidyselect helper>)
54b179fcf <Romain Francois> re exporting tidyselect helpers for convenience
e31098a4a <Romain Francois> read_feather(col_select=) using tidy selection, as in vroom::vroom
---
r/DESCRIPTION | 4 +-
r/NAMESPACE | 25 +++
r/R/Table.R | 13 ++
r/R/arrow-package.R | 3 +-
r/R/arrowExports.R | 52 ++++-
r/R/csv.R | 30 ++-
r/R/feather.R | 14 +-
r/R/json.R | 6 +-
r/R/parquet.R | 99 +++++++++-
.../test-parquet.R => R/reexports-tidyselect.R} | 46 +++--
r/configure | 2 +-
r/configure.win | 2 +-
r/data-raw/codegen.R | 6 +-
r/man/parquet_arrow_reader_properties.Rd | 14 ++
r/man/parquet_file_reader.Rd | 18 ++
r/man/read_csv_arrow.Rd | 17 +-
r/man/read_feather.Rd | 4 +-
r/man/read_json_arrow.Rd | 4 +-
r/man/read_parquet.Rd | 11 +-
r/man/reexports.Rd | 13 +-
r/src/arrowExports.cpp | 217 ++++++++++++++++++++-
r/src/arrow_types.h | 3 +
r/src/feather.cpp | 21 +-
r/src/parquet.cpp | 89 +++++++--
r/src/table.cpp | 28 +++
r/tests/testthat/test-arrow-csv.R | 37 +++-
r/tests/testthat/test-feather.R | 30 ++-
r/tests/testthat/test-json.R | 16 ++
r/tests/testthat/test-parquet.R | 8 +
29 files changed, 742 insertions(+), 90 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 70a6654..45edda1 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -31,7 +31,8 @@ Imports:
assertthat,
R6,
fs,
- bit64
+ bit64,
+ tidyselect
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
Suggests:
@@ -76,4 +77,5 @@ Collate:
'read_record_batch.R'
'read_table.R'
'reexports-bit64.R'
+ 'reexports-tidyselect.R'
'write_arrow.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 66f2004..a64c5cb 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -61,6 +61,9 @@ S3method(json_table_reader,default)
S3method(json_table_reader,fs_path)
S3method(length,"arrow::Array")
S3method(names,"arrow::RecordBatch")
+S3method(parquet_file_reader,"arrow::io::RandomAccessFile")
+S3method(parquet_file_reader,character)
+S3method(parquet_file_reader,fs_path)
S3method(print,"arrow-enum")
S3method(read_message,"arrow::io::InputStream")
S3method(read_message,"arrow::ipc::MessageReader")
@@ -120,6 +123,7 @@ export(buffer)
export(cast_options)
export(chunked_array)
export(compression_codec)
+export(contains)
export(csv_convert_options)
export(csv_parse_options)
export(csv_read_options)
@@ -129,6 +133,8 @@ export(date64)
export(decimal)
export(default_memory_pool)
export(dictionary)
+export(ends_with)
+export(everything)
export(field)
export(float16)
export(float32)
@@ -141,10 +147,16 @@ export(int8)
export(json_parse_options)
export(json_read_options)
export(json_table_reader)
+export(last_col)
export(list_of)
+export(matches)
export(mmap_create)
export(mmap_open)
export(null)
+export(num_range)
+export(one_of)
+export(parquet_arrow_reader_properties)
+export(parquet_file_reader)
export(read_arrow)
export(read_csv_arrow)
export(read_feather)
@@ -156,6 +168,7 @@ export(read_schema)
export(read_table)
export(record_batch)
export(schema)
+export(starts_with)
export(struct)
export(table)
export(time32)
@@ -182,8 +195,20 @@ importFrom(purrr,map_int)
importFrom(rlang,"%||%")
importFrom(rlang,abort)
importFrom(rlang,dots_n)
+importFrom(rlang,enquo)
+importFrom(rlang,enquos)
importFrom(rlang,is_false)
importFrom(rlang,list2)
+importFrom(rlang,quo_is_null)
importFrom(rlang,warn)
+importFrom(tidyselect,contains)
+importFrom(tidyselect,ends_with)
+importFrom(tidyselect,everything)
+importFrom(tidyselect,last_col)
+importFrom(tidyselect,matches)
+importFrom(tidyselect,num_range)
+importFrom(tidyselect,one_of)
+importFrom(tidyselect,starts_with)
+importFrom(tidyselect,vars_select)
importFrom(utils,packageVersion)
useDynLib(arrow, .registration = TRUE)
diff --git a/r/R/Table.R b/r/R/Table.R
index d1e4b18..1aec916 100644
--- a/r/R/Table.R
+++ b/r/R/Table.R
@@ -40,6 +40,19 @@
assert_that(inherits(options, "arrow::compute::CastOptions"))
assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas")
shared_ptr(`arrow::Table`, Table__cast(self, target_schema, options))
+ },
+
+ select = function(spec) {
+ spec <- enquo(spec)
+ if (quo_is_null(spec)) {
+ self
+ } else {
+ all_vars <- Table__column_names(self)
+ vars <- vars_select(all_vars, !!spec)
+ indices <- match(vars, all_vars)
+ shared_ptr(`arrow::Table`, Table__select(self, indices))
+ }
+
}
),
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 77b3a31..d21931c 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -18,8 +18,9 @@
#' @importFrom R6 R6Class
#' @importFrom purrr map map_int map2
#' @importFrom assertthat assert_that
-#' @importFrom rlang list2 %||% is_false abort dots_n warn
+#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos
#' @importFrom Rcpp sourceCpp
+#' @importFrom tidyselect vars_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
"_PACKAGE"
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index fa6cb13..06efab1 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -548,6 +548,10 @@ ipc___feather___TableReader__Open <- function(stream){
.Call(`_arrow_ipc___feather___TableReader__Open` , stream)
}
+ipc___feather___TableReader__column_names <- function(reader){
+ .Call(`_arrow_ipc___feather___TableReader__column_names` , reader)
+}
+
Field__initialize <- function(name, field, nullable){
.Call(`_arrow_Field__initialize` , name, field, nullable)
}
@@ -744,14 +748,50 @@ ipc___ReadMessage <- function(stream){
.Call(`_arrow_ipc___ReadMessage` , stream)
}
-read_parquet_file <- function(filename){
- .Call(`_arrow_read_parquet_file` , filename)
+parquet___arrow___ArrowReaderProperties__Make <- function(use_threads){
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__Make` , use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__set_use_threads <- function(properties, use_threads){
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads` , properties, use_threads))
+}
+
+parquet___arrow___ArrowReaderProperties__get_use_threads <- function(properties, use_threads){
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads` , properties, use_threads)
+}
+
+parquet___arrow___ArrowReaderProperties__get_read_dictionary <- function(properties, column_index){
+ .Call(`_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary` , properties, column_index)
+}
+
+parquet___arrow___ArrowReaderProperties__set_read_dictionary <- function(properties, column_index, read_dict){
+ invisible(.Call(`_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary` , properties, column_index, read_dict))
+}
+
+parquet___arrow___FileReader__OpenFile <- function(file, props){
+ .Call(`_arrow_parquet___arrow___FileReader__OpenFile` , file, props)
+}
+
+parquet___arrow___FileReader__ReadTable1 <- function(reader){
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable1` , reader)
+}
+
+parquet___arrow___FileReader__ReadTable2 <- function(reader, column_indices){
+ .Call(`_arrow_parquet___arrow___FileReader__ReadTable2` , reader, column_indices)
}
write_parquet_file <- function(table, filename){
invisible(.Call(`_arrow_write_parquet_file` , table, filename))
}
+parquet___arrow___FileReader__GetSchema2 <- function(reader, indices){
+ .Call(`_arrow_parquet___arrow___FileReader__GetSchema2` , reader, indices)
+}
+
+parquet___arrow___FileReader__GetSchema1 <- function(reader){
+ .Call(`_arrow_parquet___arrow___FileReader__GetSchema1` , reader)
+}
+
RecordBatch__num_columns <- function(x){
.Call(`_arrow_RecordBatch__num_columns` , x)
}
@@ -900,6 +940,14 @@ Table__columns <- function(table){
.Call(`_arrow_Table__columns` , table)
}
+Table__column_names <- function(table){
+ .Call(`_arrow_Table__column_names` , table)
+}
+
+Table__select <- function(table, indices){
+ .Call(`_arrow_Table__select` , table, indices)
+}
+
Table__from_dots <- function(lst, schema_sxp){
.Call(`_arrow_Table__from_dots` , lst, schema_sxp)
}
diff --git a/r/R/csv.R b/r/R/csv.R
index d34ddcb..03a4b7d 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -138,7 +138,7 @@ csv_table_reader.default <- function(file,
convert_options = csv_convert_options(),
...
){
- csv_table_reader(ReadableFile(file),
+ csv_table_reader(mmap_open(file),
read_options = read_options,
parse_options = parse_options,
convert_options = convert_options,
@@ -172,8 +172,30 @@ csv_table_reader.default <- function(file,
#'
#' Use arrow::csv::TableReader from [csv_table_reader()]
#'
-#' @param ... Used to construct an arrow::csv::TableReader
+#' @inheritParams csv_table_reader
+#'
+#' @param col_select [tidy selection specification][tidyselect::vars_select] of columns
+#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a data frame.
+#'
#' @export
-read_csv_arrow <- function(...) {
- csv_table_reader(...)$Read()
+read_csv_arrow <- function(file,
+ read_options = csv_read_options(),
+ parse_options = csv_parse_options(),
+ convert_options = csv_convert_options(),
+ col_select = NULL,
+ as_tibble = TRUE
+ )
+{
+ reader <- csv_table_reader(file,
+ read_options = read_options,
+ parse_options = parse_options,
+ convert_options = convert_options)
+
+ tab <- reader$Read()$select(!!enquo(col_select))
+
+ if (isTRUE(as_tibble)) {
+ tab <- as.data.frame(tab)
+ }
+
+ tab
}
diff --git a/r/R/feather.R b/r/R/feather.R
index f197fd0..7877b4b 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -162,15 +162,23 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) {
#' Read a feather file
#'
#' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle
-#' @param columns names if the columns to read. The default `NULL` means all columns
+#' @param col_select [tidy selection][tidyselect::vars_select()] of columns to read.
#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble.
#' @param ... additional parameters
#'
#' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise
#'
#' @export
-read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){
- out <- FeatherTableReader(file, ...)$Read(columns)
+read_feather <- function(file, col_select = NULL, as_tibble = TRUE, ...){
+ reader <- FeatherTableReader(file, ...)
+
+ all_columns <- ipc___feather___TableReader__column_names(reader)
+ col_select <- enquo(col_select)
+ columns <- if (!quo_is_null(col_select)) {
+ vars_select(all_columns, !!col_select)
+ }
+
+ out <- reader$Read(columns)
if (isTRUE(as_tibble)) {
out <- as.data.frame(out)
}
diff --git a/r/R/json.R b/r/R/json.R
index e51051d..1409e31 100644
--- a/r/R/json.R
+++ b/r/R/json.R
@@ -148,10 +148,12 @@ json_table_reader.default <- function(file,
#'
#' @param ... Used to construct an arrow::json::TableReader
#' @param as_tibble convert the [arrow::Table][arrow__Table] to a data frame
+#' @param col_select [tidy selection][tidyselect::vars_select] of columns
#'
#' @export
-read_json_arrow <- function(..., as_tibble = TRUE) {
- tab <- json_table_reader(...)$Read()
+read_json_arrow <- function(..., as_tibble = TRUE, col_select = NULL) {
+ tab <- json_table_reader(...)$Read()$select(!!enquo(col_select))
+
if (isTRUE(as_tibble)) {
tab <- as.data.frame(tab)
}
diff --git a/r/R/parquet.R b/r/R/parquet.R
index b5e5884..fb7cc45 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -15,15 +15,102 @@
# specific language governing permissions and limitations
# under the License.
+#' @include R6.R
+
+`parquet::arrow::FileReader` <- R6Class("parquet::arrow::FileReader",
+ inherit = `arrow::Object`,
+ public = list(
+ ReadTable = function(col_select = NULL) {
+ col_select <- enquo(col_select)
+ if(quo_is_null(col_select)) {
+ shared_ptr(`arrow::Table`, parquet___arrow___FileReader__ReadTable1(self))
+ } else {
+ all_vars <- shared_ptr(`arrow::Schema`, parquet___arrow___FileReader__GetSchema1(self))$names
+ indices <- match(vars_select(all_vars, !!col_select), all_vars) - 1L
+ shared_ptr(`arrow::Table`, parquet___arrow___FileReader__ReadTable2(self, indices))
+ }
+ },
+ GetSchema = function(column_indices = NULL) {
+ if (is.null(column_indices)) {
+ shared_ptr(`arrow::Schema`, parquet___arrow___FileReader__GetSchema1(self))
+ } else {
+ shared_ptr(`arrow::Schema`, parquet___arrow___FileReader__GetSchema2(self, column_indices))
+ }
+
+ }
+ )
+)
+
+`parquet::arrow::ArrowReaderProperties` <- R6Class("parquet::arrow::ArrowReaderProperties",
+ inherit = `arrow::Object`,
+ public = list(
+ read_dictionary = function(column_index) {
+ parquet___arrow___ArrowReaderProperties__get_read_dictionary(self, column_index)
+ },
+ set_read_dictionary = function(column_index, read_dict) {
+ parquet___arrow___ArrowReaderProperties__set_read_dictionary(self, column_index, read_dict)
+ }
+ ),
+ active = list(
+ use_threads = function(use_threads) {
+ if(missing(use_threads)) {
+ parquet___arrow___ArrowReaderProperties__get_use_threads(self)
+ } else {
+ parquet___arrow___ArrowReaderProperties__set_use_threads(self, use_threads)
+ }
+ }
+ )
+)
+
+#' Create a new ArrowReaderProperties instance
+#'
+#' @param use_threads use threads ?
+#'
+#' @export
+parquet_arrow_reader_properties <- function(use_threads = TRUE) {
+ shared_ptr(`parquet::arrow::ArrowReaderProperties`, parquet___arrow___ArrowReaderProperties__Make(isTRUE(use_threads)))
+}
+
+#' Create a FileReader instance
+#'
+#' @param file file
+#' @param props reader file properties, as created by [parquet_arrow_reader_properties()]
+#'
+#' @param ... additional parameters
+#'
+#' @export
+parquet_file_reader <- function(file, props = parquet_arrow_reader_properties(), ...) {
+ UseMethod("parquet_file_reader")
+}
+
+#' @export
+`parquet_file_reader.arrow::io::RandomAccessFile` <- function(file, props = parquet_arrow_reader_properties(), ...) {
+ unique_ptr(`parquet::arrow::FileReader`, parquet___arrow___FileReader__OpenFile(file, props))
+}
+
+#' @export
+parquet_file_reader.fs_path <- function(file, props = parquet_arrow_reader_properties(), memory_map = TRUE, ...) {
+ if (isTRUE(memory_map)) {
+ parquet_file_reader(mmap_open(file), props = props, ...)
+ } else {
+ parquet_file_reader(ReadableFile(file), props = props, ...)
+ }
+}
+
+#' @export
+parquet_file_reader.character <- function(file, props = parquet_arrow_reader_properties(), memory_map = TRUE, ...) {
+ parquet_file_reader(fs::path_abs(file), props = parquet_arrow_reader_properties(), memory_map = memory_map, ...)
+}
+
#' Read Parquet file from disk
#'
#' '[Parquet](https://parquet.apache.org/)' is a columnar storage file format.
#' This function enables you to read Parquet files into R.
#'
-#' @param file a file path
+#' @inheritParams parquet_file_reader
#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a
#' tibble? Default is `TRUE`.
-#' @param ... Additional arguments, currently ignored
+#' @param col_select [tidy selection][tidyselect::vars_select] of columns to read
#'
#' @return A [arrow::Table][arrow__Table], or a `tbl_df` if `as_tibble` is
#' `TRUE`.
@@ -34,9 +121,11 @@
#' }
#'
#' @export
-read_parquet <- function(file, as_tibble = TRUE, ...) {
- tab <- shared_ptr(`arrow::Table`, read_parquet_file(file))
- if (isTRUE(as_tibble)) {
+read_parquet <- function(file, props = parquet_arrow_reader_properties(), as_tibble = TRUE, col_select = NULL, ...) {
+ reader <- parquet_file_reader(file, props = props, ...)
+ tab <- reader$ReadTable(!!enquo(col_select))
+
+ if (as_tibble) {
tab <- as.data.frame(tab)
}
tab
diff --git a/r/tests/testthat/test-parquet.R b/r/R/reexports-tidyselect.R
similarity index 55%
copy from r/tests/testthat/test-parquet.R
copy to r/R/reexports-tidyselect.R
index 554744e..2566207 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/R/reexports-tidyselect.R
@@ -15,23 +15,29 @@
# specific language governing permissions and limitations
# under the License.
-context("Parquet file reading/writing")
-
-pq_file <- system.file("v0.7.1.parquet", package="arrow")
-
-test_that("reading a known Parquet file to tibble", {
- df <- read_parquet(pq_file)
- expect_true(tibble::is_tibble(df))
- expect_identical(dim(df), c(10L, 11L))
- # TODO: assert more about the contents
-})
-
-test_that("simple int column roundtrip", {
- df <- tibble::tibble(x = 1:5)
- pq_tmp_file <- tempfile() # You can specify the .parquet here but that's probably not necessary
- on.exit(unlink(pq_tmp_file))
-
- write_parquet(df, pq_tmp_file)
- df_read <- read_parquet(pq_tmp_file)
- expect_identical(df, df_read)
-})
+# Alias required for help links in downstream packages
+#' @aliases select_helpers
+#' @importFrom tidyselect contains
+#' @export
+tidyselect::contains
+#' @importFrom tidyselect ends_with
+#' @export
+tidyselect::ends_with
+#' @importFrom tidyselect everything
+#' @export
+tidyselect::everything
+#' @importFrom tidyselect matches
+#' @export
+tidyselect::matches
+#' @importFrom tidyselect num_range
+#' @export
+tidyselect::num_range
+#' @importFrom tidyselect one_of
+#' @export
+tidyselect::one_of
+#' @importFrom tidyselect starts_with
+#' @export
+tidyselect::starts_with
+#' @importFrom tidyselect last_col
+#' @export
+tidyselect::last_col
diff --git a/r/configure b/r/configure
index d5a2688..da3d868 100755
--- a/r/configure
+++ b/r/configure
@@ -110,7 +110,7 @@ if [ $? -ne 0 ]; then
PKG_LIBS=""
PKG_CFLAGS=""
else
- PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET"
+ PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_ARROW"
echo "PKG_CFLAGS=$PKG_CFLAGS"
echo "PKG_LIBS=$PKG_LIBS"
fi
diff --git a/r/configure.win b/r/configure.win
index f3d7aad..5ad2a95 100644
--- a/r/configure.win
+++ b/r/configure.win
@@ -38,7 +38,7 @@ if [ $? -ne 0 ]; then
else
# Set the right flags to point to and enable arrow/parquet
RWINLIB="../windows/arrow-${VERSION}"
- PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_PARQUET -DARROW_R_WITH_ARROW"
+ PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC -DARROW_R_WITH_ARROW"
PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) '"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow -lthrift -lsnappy -lboost_regex-mt-s -lboost_filesystem-mt-s -lboost_system-mt-s -ldouble-conversion -lz -lws2_32"
fi
diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R
index 3e56e33..ba5d0cb 100644
--- a/r/data-raw/codegen.R
+++ b/r/data-raw/codegen.R
@@ -35,7 +35,7 @@
# #endif
suppressPackageStartupMessages({
- # pak::pkg_install("romainfrancois/decor")
+ # remotes::install_github("romainfrancois/decor")
library(decor)
library(dplyr)
@@ -43,6 +43,10 @@ suppressPackageStartupMessages({
library(glue)
})
+if (packageVersion("decor") < '0.0.0.9001') {
+ stop("more recent version of `decor` needed, please install with `remotes::install_github('romainfrancois/decor')`")
+}
+
decorations <- cpp_decorations() %>%
filter(decoration == "arrow::export") %>%
# the three lines below can be expressed with rap()
diff --git a/r/man/parquet_arrow_reader_properties.Rd b/r/man/parquet_arrow_reader_properties.Rd
new file mode 100644
index 0000000..6276d7f
--- /dev/null
+++ b/r/man/parquet_arrow_reader_properties.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{parquet_arrow_reader_properties}
+\alias{parquet_arrow_reader_properties}
+\title{Create a new ArrowReaderProperties instance}
+\usage{
+parquet_arrow_reader_properties(use_threads = TRUE)
+}
+\arguments{
+\item{use_threads}{use threads ?}
+}
+\description{
+Create a new ArrowReaderProperties instance
+}
diff --git a/r/man/parquet_file_reader.Rd b/r/man/parquet_file_reader.Rd
new file mode 100644
index 0000000..8cbe6b0
--- /dev/null
+++ b/r/man/parquet_file_reader.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{parquet_file_reader}
+\alias{parquet_file_reader}
+\title{Create a FileReader instance}
+\usage{
+parquet_file_reader(file, props = parquet_arrow_reader_properties(), ...)
+}
+\arguments{
+\item{file}{file}
+
+\item{props}{reader file properties, as created by \code{\link[=parquet_arrow_reader_properties]{parquet_arrow_reader_properties()}}}
+
+\item{...}{additional parameters}
+}
+\description{
+Create a FileReader instance
+}
diff --git a/r/man/read_csv_arrow.Rd b/r/man/read_csv_arrow.Rd
index 4cdca91..47e5158 100644
--- a/r/man/read_csv_arrow.Rd
+++ b/r/man/read_csv_arrow.Rd
@@ -4,10 +4,23 @@
\alias{read_csv_arrow}
\title{Read csv file into an arrow::Table}
\usage{
-read_csv_arrow(...)
+read_csv_arrow(file, read_options = csv_read_options(),
+ parse_options = csv_parse_options(),
+ convert_options = csv_convert_options(), col_select = NULL,
+ as_tibble = TRUE)
}
\arguments{
-\item{...}{Used to construct an arrow::csv::TableReader}
+\item{file}{file}
+
+\item{read_options}{see \code{\link[=csv_read_options]{csv_read_options()}}}
+
+\item{parse_options}{see \code{\link[=csv_parse_options]{csv_parse_options()}}}
+
+\item{convert_options}{see \code{\link[=csv_convert_options]{csv_convert_options()}}}
+
+\item{col_select}{\link[tidyselect:vars_select]{tidy selection specification} of columns}
+
+\item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a data frame.}
}
\description{
Use arrow::csv::TableReader from \code{\link[=csv_table_reader]{csv_table_reader()}}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 31fd36a..0492cce 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -4,12 +4,12 @@
\alias{read_feather}
\title{Read a feather file}
\usage{
-read_feather(file, columns = NULL, as_tibble = TRUE, ...)
+read_feather(file, col_select = NULL, as_tibble = TRUE, ...)
}
\arguments{
\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle}
-\item{columns}{names if the columns to read. The default \code{NULL} means all columns}
+\item{col_select}{\link[tidyselect:vars_select]{tidy selection} of columns to read.}
\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.}
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
index aa98b69..75ccd69 100644
--- a/r/man/read_json_arrow.Rd
+++ b/r/man/read_json_arrow.Rd
@@ -4,12 +4,14 @@
\alias{read_json_arrow}
\title{Read json file into an arrow::Table}
\usage{
-read_json_arrow(..., as_tibble = TRUE)
+read_json_arrow(..., as_tibble = TRUE, col_select = NULL)
}
\arguments{
\item{...}{Used to construct an arrow::json::TableReader}
\item{as_tibble}{convert the \link[=arrow__Table]{arrow::Table} to a data frame}
+
+\item{col_select}{\link[tidyselect:vars_select]{tidy selection} of columns}
}
\description{
Use \link[=arrow__json__TableReader]{arrow::json::TableReader} from \code{\link[=json_table_reader]{json_table_reader()}}
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
index 3b1973b..145ef65 100644
--- a/r/man/read_parquet.Rd
+++ b/r/man/read_parquet.Rd
@@ -4,15 +4,20 @@
\alias{read_parquet}
\title{Read Parquet file from disk}
\usage{
-read_parquet(file, as_tibble = TRUE, ...)
+read_parquet(file, props = parquet_arrow_reader_properties(),
+ as_tibble = TRUE, col_select = NULL, ...)
}
\arguments{
-\item{file}{a file path}
+\item{file}{file}
+
+\item{props}{reader file properties, as created by \code{\link[=parquet_arrow_reader_properties]{parquet_arrow_reader_properties()}}}
\item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a
tibble? Default is \code{TRUE}.}
-\item{...}{Additional arguments, currently ignored}
+\item{col_select}{\link[tidyselect:vars_select]{tidy selection} of columns to read}
+
+\item{...}{additional parameters}
}
\value{
A \link[=arrow__Table]{arrow::Table}, or a \code{tbl_df} if \code{as_tibble} is
diff --git a/r/man/reexports.Rd b/r/man/reexports.Rd
index 6951fcf..3b1e527 100644
--- a/r/man/reexports.Rd
+++ b/r/man/reexports.Rd
@@ -1,10 +1,19 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/reexports-bit64.R
+% Please edit documentation in R/reexports-bit64.R, R/reexports-tidyselect.R
\docType{import}
\name{reexports}
\alias{reexports}
\alias{print.integer64}
\alias{str.integer64}
+\alias{contains}
+\alias{select_helpers}
+\alias{ends_with}
+\alias{everything}
+\alias{matches}
+\alias{num_range}
+\alias{one_of}
+\alias{starts_with}
+\alias{last_col}
\title{Objects exported from other packages}
\keyword{internal}
\description{
@@ -13,5 +22,7 @@ below to see their documentation.
\describe{
\item{bit64}{\code{\link[bit64]{print.integer64}}, \code{\link[bit64]{str.integer64}}}
+
+ \item{tidyselect}{\code{\link[tidyselect]{contains}}, \code{\link[tidyselect]{ends_with}}, \code{\link[tidyselect]{everything}}, \code{\link[tidyselect]{matches}}, \code{\link[tidyselect]{num_range}}, \code{\link[tidyselect]{one_of}}, \code{\link[tidyselect]{starts_with}}, \code{\link[tidyselect]{last_col}}}
}}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 49dbf78..d2962fa 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2103,6 +2103,21 @@ RcppExport SEXP _arrow_ipc___feather___TableReader__Open(SEXP stream_sexp){
}
#endif
+// feather.cpp
+#if defined(ARROW_R_WITH_ARROW)
+Rcpp::CharacterVector ipc___feather___TableReader__column_names(const std::unique_ptr<arrow::ipc::feather::TableReader>& reader);
+RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::unique_ptr<arrow::ipc::feather::TableReader>&>::type reader(reader_sexp);
+ return Rcpp::wrap(ipc___feather___TableReader__column_names(reader));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){
+ Rf_error("Cannot call ipc___feather___TableReader__column_names(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// field.cpp
#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<arrow::Field> Field__initialize(const std::string& name, const std::shared_ptr<arrow::DataType>& field, bool nullable);
@@ -2860,16 +2875,130 @@ RcppExport SEXP _arrow_ipc___ReadMessage(SEXP stream_sexp){
// parquet.cpp
#if defined(ARROW_R_WITH_ARROW)
-std::shared_ptr<arrow::Table> read_parquet_file(std::string filename);
-RcppExport SEXP _arrow_read_parquet_file(SEXP filename_sexp){
+std::shared_ptr<parquet::arrow::ArrowReaderProperties> parquet___arrow___ArrowReaderProperties__Make(bool use_threads);
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__Make(SEXP use_threads_sexp){
BEGIN_RCPP
- Rcpp::traits::input_parameter<std::string>::type filename(filename_sexp);
- return Rcpp::wrap(read_parquet_file(filename));
+ Rcpp::traits::input_parameter<bool>::type use_threads(use_threads_sexp);
+ return Rcpp::wrap(parquet___arrow___ArrowReaderProperties__Make(use_threads));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__Make(SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__Make(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void parquet___arrow___ArrowReaderProperties__set_use_threads(const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties, bool use_threads);
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<parquet::arrow::ArrowReaderProperties>&>::type properties(properties_sexp);
+ Rcpp::traits::input_parameter<bool>::type use_threads(use_threads_sexp);
+ parquet___arrow___ArrowReaderProperties__set_use_threads(properties, use_threads);
+ return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__set_use_threads(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool parquet___arrow___ArrowReaderProperties__get_use_threads(const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties, bool use_threads);
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<parquet::arrow::ArrowReaderProperties>&>::type properties(properties_sexp);
+ Rcpp::traits::input_parameter<bool>::type use_threads(use_threads_sexp);
+ return Rcpp::wrap(parquet___arrow___ArrowReaderProperties__get_use_threads(properties, use_threads));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_use_threads(SEXP properties_sexp, SEXP use_threads_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__get_use_threads(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+bool parquet___arrow___ArrowReaderProperties__get_read_dictionary(const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties, int column_index);
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<parquet::arrow::ArrowReaderProperties>&>::type properties(properties_sexp);
+ Rcpp::traits::input_parameter<int>::type column_index(column_index_sexp);
+ return Rcpp::wrap(parquet___arrow___ArrowReaderProperties__get_read_dictionary(properties, column_index));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__get_read_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void parquet___arrow___ArrowReaderProperties__set_read_dictionary(const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties, int column_index, bool read_dict);
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp, SEXP read_dict_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<parquet::arrow::ArrowReaderProperties>&>::type properties(properties_sexp);
+ Rcpp::traits::input_parameter<int>::type column_index(column_index_sexp);
+ Rcpp::traits::input_parameter<bool>::type read_dict(read_dict_sexp);
+ parquet___arrow___ArrowReaderProperties__set_read_dictionary(properties, column_index, read_dict);
+ return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary(SEXP properties_sexp, SEXP column_index_sexp, SEXP read_dict_sexp){
+ Rf_error("Cannot call parquet___arrow___ArrowReaderProperties__set_read_dictionary(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::unique_ptr<parquet::arrow::FileReader> parquet___arrow___FileReader__OpenFile(const std::shared_ptr<arrow::io::RandomAccessFile>& file, const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& props);
+RcppExport SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, SEXP props_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::io::RandomAccessFile>&>::type file(file_sexp);
+ Rcpp::traits::input_parameter<const std::shared_ptr<parquet::arrow::ArrowReaderProperties>&>::type props(props_sexp);
+ return Rcpp::wrap(parquet___arrow___FileReader__OpenFile(file, props));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___FileReader__OpenFile(SEXP file_sexp, SEXP props_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__OpenFile(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable1(const std::unique_ptr<parquet::arrow::FileReader>& reader);
+RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable1(SEXP reader_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::unique_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return Rcpp::wrap(parquet___arrow___FileReader__ReadTable1(reader));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable1(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadTable1(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(const std::unique_ptr<parquet::arrow::FileReader>& reader, const std::vector<int>& column_indices);
+RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp, SEXP column_indices_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::unique_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ Rcpp::traits::input_parameter<const std::vector<int>&>::type column_indices(column_indices_sexp);
+ return Rcpp::wrap(parquet___arrow___FileReader__ReadTable2(reader, column_indices));
END_RCPP
}
#else
-RcppExport SEXP _arrow_read_parquet_file(SEXP filename_sexp){
- Rf_error("Cannot call read_parquet_file(). Please use arrow::install_arrow() to install required runtime libraries. ");
+RcppExport SEXP _arrow_parquet___arrow___FileReader__ReadTable2(SEXP reader_sexp, SEXP column_indices_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__ReadTable2(). Please use arrow::install_arrow() to install required runtime libraries. ");
}
#endif
@@ -2890,6 +3019,37 @@ RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){
}
#endif
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema2(const std::unique_ptr<parquet::arrow::FileReader>& reader, const std::vector<int>& indices);
+RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema2(SEXP reader_sexp, SEXP indices_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::unique_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ Rcpp::traits::input_parameter<const std::vector<int>&>::type indices(indices_sexp);
+ return Rcpp::wrap(parquet___arrow___FileReader__GetSchema2(reader, indices));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema2(SEXP reader_sexp, SEXP indices_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__GetSchema2(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema1(const std::unique_ptr<parquet::arrow::FileReader>& reader);
+RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema1(SEXP reader_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::unique_ptr<parquet::arrow::FileReader>&>::type reader(reader_sexp);
+ return Rcpp::wrap(parquet___arrow___FileReader__GetSchema1(reader));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema1(SEXP reader_sexp){
+ Rf_error("Cannot call parquet___arrow___FileReader__GetSchema1(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// recordbatch.cpp
#if defined(ARROW_R_WITH_ARROW)
int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x);
@@ -3465,6 +3625,37 @@ RcppExport SEXP _arrow_Table__columns(SEXP table_sexp){
// table.cpp
#if defined(ARROW_R_WITH_ARROW)
+Rcpp::CharacterVector Table__column_names(const std::shared_ptr<arrow::Table>& table);
+RcppExport SEXP _arrow_Table__column_names(SEXP table_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ return Rcpp::wrap(Table__column_names(table));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_Table__column_names(SEXP table_sexp){
+ Rf_error("Cannot call Table__column_names(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Table> Table__select(const std::shared_ptr<arrow::Table>& table, const Rcpp::IntegerVector& indices);
+RcppExport SEXP _arrow_Table__select(SEXP table_sexp, SEXP indices_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ Rcpp::traits::input_parameter<const Rcpp::IntegerVector&>::type indices(indices_sexp);
+ return Rcpp::wrap(Table__select(table, indices));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_Table__select(SEXP table_sexp, SEXP indices_sexp){
+ Rf_error("Cannot call Table__select(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// table.cpp
+#if defined(ARROW_R_WITH_ARROW)
std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp);
RcppExport SEXP _arrow_Table__from_dots(SEXP lst_sexp, SEXP schema_sxp_sexp){
BEGIN_RCPP
@@ -3659,6 +3850,7 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2},
{ "_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2},
{ "_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1},
+ { "_arrow_ipc___feather___TableReader__column_names", (DL_FUNC) &_arrow_ipc___feather___TableReader__column_names, 1},
{ "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3},
{ "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1},
{ "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1},
@@ -3708,8 +3900,17 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_ipc___MessageReader__Open", (DL_FUNC) &_arrow_ipc___MessageReader__Open, 1},
{ "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1},
{ "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1},
- { "_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__Make", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__Make, 1},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_use_threads, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_use_threads, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__get_read_dictionary, 2},
+ { "_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary", (DL_FUNC) &_arrow_parquet___arrow___ArrowReaderProperties__set_read_dictionary, 3},
+ { "_arrow_parquet___arrow___FileReader__OpenFile", (DL_FUNC) &_arrow_parquet___arrow___FileReader__OpenFile, 2},
+ { "_arrow_parquet___arrow___FileReader__ReadTable1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable1, 1},
+ { "_arrow_parquet___arrow___FileReader__ReadTable2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__ReadTable2, 2},
{ "_arrow_write_parquet_file", (DL_FUNC) &_arrow_write_parquet_file, 2},
+ { "_arrow_parquet___arrow___FileReader__GetSchema2", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema2, 2},
+ { "_arrow_parquet___arrow___FileReader__GetSchema1", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema1, 1},
{ "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1},
{ "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1},
{ "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1},
@@ -3747,6 +3948,8 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1},
{ "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2},
{ "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1},
+ { "_arrow_Table__column_names", (DL_FUNC) &_arrow_Table__column_names, 1},
+ { "_arrow_Table__select", (DL_FUNC) &_arrow_Table__select, 2},
{ "_arrow_Table__from_dots", (DL_FUNC) &_arrow_Table__from_dots, 2},
{ "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0},
{ "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1},
diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h
index c93d448..3ff1a3d 100644
--- a/r/src/arrow_types.h
+++ b/r/src/arrow_types.h
@@ -176,6 +176,9 @@ inline std::shared_ptr<T> extract(SEXP x) {
#include <arrow/json/reader.h>
#include <arrow/type.h>
#include <arrow/util/compression.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
RCPP_EXPOSED_ENUM_NODECL(arrow::Type::type)
RCPP_EXPOSED_ENUM_NODECL(arrow::DateUnit)
diff --git a/r/src/feather.cpp b/r/src/feather.cpp
index 2836ba8..a519881 100644
--- a/r/src/feather.cpp
+++ b/r/src/feather.cpp
@@ -119,16 +119,6 @@ std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(
std::shared_ptr<arrow::Table> table;
switch (TYPEOF(columns)) {
- case INTSXP: {
- R_xlen_t n = XLENGTH(columns);
- std::vector<int> indices(n);
- int* p_columns = INTEGER(columns);
- for (int i = 0; i < n; i++) {
- indices[i] = p_columns[i] - 1;
- }
- STOP_IF_NOT_OK(reader->Read(indices, &table));
- break;
- }
case STRSXP: {
R_xlen_t n = XLENGTH(columns);
std::vector<std::string> names(n);
@@ -157,4 +147,15 @@ std::unique_ptr<arrow::ipc::feather::TableReader> ipc___feather___TableReader__O
return reader;
}
+// [[arrow::export]]
+Rcpp::CharacterVector ipc___feather___TableReader__column_names(
+ const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
+ int64_t n = reader->num_columns();
+ Rcpp::CharacterVector out(n);
+ for (int64_t i = 0; i < n; i++) {
+ out[i] = reader->GetColumnName(i);
+ }
+ return out;
+}
+
#endif
diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp
index a89801e..9ad1438 100644
--- a/r/src/parquet.cpp
+++ b/r/src/parquet.cpp
@@ -19,44 +19,101 @@
#if defined(ARROW_R_WITH_ARROW)
-#ifdef ARROW_R_WITH_PARQUET
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
-#endif
// [[arrow::export]]
-std::shared_ptr<arrow::Table> read_parquet_file(std::string filename) {
-#ifdef ARROW_R_WITH_PARQUET
- std::shared_ptr<arrow::io::ReadableFile> infile;
- PARQUET_THROW_NOT_OK(
- arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool(), &infile));
+std::shared_ptr<parquet::arrow::ArrowReaderProperties>
+parquet___arrow___ArrowReaderProperties__Make(bool use_threads) {
+ return std::make_shared<parquet::arrow::ArrowReaderProperties>(use_threads);
+}
+
+// [[arrow::export]]
+void parquet___arrow___ArrowReaderProperties__set_use_threads(
+ const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties,
+ bool use_threads) {
+ properties->set_use_threads(use_threads);
+}
+
+// [[arrow::export]]
+bool parquet___arrow___ArrowReaderProperties__get_use_threads(
+ const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties,
+ bool use_threads) {
+ return properties->use_threads();
+}
+// [[arrow::export]]
+bool parquet___arrow___ArrowReaderProperties__get_read_dictionary(
+ const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties,
+ int column_index) {
+ return properties->read_dictionary(column_index);
+}
+
+// [[arrow::export]]
+void parquet___arrow___ArrowReaderProperties__set_read_dictionary(
+ const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& properties,
+ int column_index, bool read_dict) {
+ properties->set_read_dictionary(column_index, read_dict);
+}
+
+// [[arrow::export]]
+std::unique_ptr<parquet::arrow::FileReader> parquet___arrow___FileReader__OpenFile(
+ const std::shared_ptr<arrow::io::RandomAccessFile>& file,
+ const std::shared_ptr<parquet::arrow::ArrowReaderProperties>& props) {
std::unique_ptr<parquet::arrow::FileReader> reader;
PARQUET_THROW_NOT_OK(
- parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ parquet::arrow::OpenFile(file, arrow::default_memory_pool(), *props, &reader));
+ return reader;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable1(
+ const std::unique_ptr<parquet::arrow::FileReader>& reader) {
std::shared_ptr<arrow::Table> table;
PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
-
return table;
-#else
- Rcpp::stop("Support for Parquet is not available.");
+}
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> parquet___arrow___FileReader__ReadTable2(
+ const std::unique_ptr<parquet::arrow::FileReader>& reader,
+ const std::vector<int>& column_indices) {
std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadTable(column_indices, &table));
return table;
-#endif
}
// [[arrow::export]]
void write_parquet_file(const std::shared_ptr<arrow::Table>& table,
std::string filename) {
-#ifdef ARROW_R_WITH_PARQUET
std::shared_ptr<arrow::io::OutputStream> sink;
PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open(filename, &sink));
PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
sink, table->num_rows()));
-#else
- Rcpp::stop("Support for Parquet is not available.");
-#endif
}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema2(
+ const std::unique_ptr<parquet::arrow::FileReader>& reader,
+ const std::vector<int>& indices) {
+ std::shared_ptr<arrow::Schema> schema;
+ STOP_IF_NOT_OK(reader->GetSchema(indices, &schema));
+ return schema;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> parquet___arrow___FileReader__GetSchema1(
+ const std::unique_ptr<parquet::arrow::FileReader>& reader) {
+ // FileReader does not have this exposed
+ // std::shared_ptr<arrow::Schema> schema;
+ // STOP_IF_NOT_OK(reader->GetSchema(&schema));
+
+ // so going indirectly about it
+ std::shared_ptr<arrow::RecordBatchReader> record_batch_reader;
+ STOP_IF_NOT_OK(reader->GetRecordBatchReader({}, &record_batch_reader));
+
+ return record_batch_reader->schema();
+}
+
#endif
diff --git a/r/src/table.cpp b/r/src/table.cpp
index b181c1c..2d9135b 100644
--- a/r/src/table.cpp
+++ b/r/src/table.cpp
@@ -63,6 +63,34 @@ std::vector<std::shared_ptr<arrow::Column>> Table__columns(
return res;
}
+// [[arrow::export]]
+Rcpp::CharacterVector Table__column_names(const std::shared_ptr<arrow::Table>& table) {
+ int nc = table->num_columns();
+ Rcpp::CharacterVector res(nc);
+ for (int i = 0; i < nc; i++) {
+ res[i] = table->column(i)->name();
+ }
+ return res;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__select(const std::shared_ptr<arrow::Table>& table,
+ const Rcpp::IntegerVector& indices) {
+ R_xlen_t n = indices.size();
+
+ std::vector<std::shared_ptr<arrow::Field>> fields(n);
+ std::vector<std::shared_ptr<arrow::Column>> columns(n);
+
+ for (R_xlen_t i = 0; i < n; i++) {
+ int pos = indices[i] - 1;
+ fields[i] = table->schema()->field(pos);
+ columns[i] = table->column(pos);
+ }
+
+ auto schema = std::make_shared<arrow::Schema>(std::move(fields));
+ return arrow::Table::Make(schema, columns);
+}
+
bool all_record_batches(SEXP lst) {
R_xlen_t n = XLENGTH(lst);
for (R_xlen_t i = 0; i < n; i++) {
diff --git a/r/tests/testthat/test-arrow-csv.R b/r/tests/testthat/test-arrow-csv.R
index ee09b60..7f0c1ae 100644
--- a/r/tests/testthat/test-arrow-csv.R
+++ b/r/tests/testthat/test-arrow-csv.R
@@ -22,9 +22,9 @@ test_that("Can read csv file", {
write.csv(iris, tf, row.names = FALSE, quote = FALSE)
- tab1 <- read_csv_arrow(tf)
- tab2 <- read_csv_arrow(mmap_open(tf))
- tab3 <- read_csv_arrow(ReadableFile(tf))
+ tab1 <- read_csv_arrow(tf, as_tibble = FALSE)
+ tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = FALSE)
+ tab3 <- read_csv_arrow(ReadableFile(tf), as_tibble = FALSE)
iris$Species <- as.character(iris$Species)
tab0 <- table(!!!iris)
@@ -34,3 +34,34 @@ test_that("Can read csv file", {
unlink(tf)
})
+
+test_that("read_csv_arrow(as_tibble=TRUE)", {
+ tf <- tempfile()
+
+ write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+
+ tab1 <- read_csv_arrow(tf, as_tibble = TRUE)
+ tab2 <- read_csv_arrow(mmap_open(tf), as_tibble = TRUE)
+ tab3 <- read_csv_arrow(ReadableFile(tf), as_tibble = TRUE)
+
+ iris$Species <- as.character(iris$Species)
+ expect_equivalent(iris, tab1)
+ expect_equivalent(iris, tab2)
+ expect_equivalent(iris, tab3)
+
+ unlink(tf)
+})
+
+test_that("read_csv_arrow() respects col_select", {
+ tf <- tempfile()
+
+ write.csv(iris, tf, row.names = FALSE, quote = FALSE)
+
+ tab <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_tibble = FALSE)
+ expect_equal(tab, table(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+
+ tib <- read_csv_arrow(tf, col_select = starts_with("Sepal"), as_tibble = TRUE)
+ expect_equal(tib, tibble::tibble(Sepal.Length = iris$Sepal.Length, Sepal.Width = iris$Sepal.Width))
+
+ unlink(tf)
+})
diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R
index 6d874b3..adf8151 100644
--- a/r/tests/testthat/test-feather.R
+++ b/r/tests/testthat/test-feather.R
@@ -62,14 +62,14 @@ test_that("feather read/write round trip", {
unlink(tf3)
})
-test_that("feather handles columns = <names>", {
+test_that("feather handles col_select = <names>", {
tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
tf1 <- tempfile()
write_feather(tib, tf1)
expect_true(fs::file_exists(tf1))
- tab1 <- read_feather(tf1, columns = c("x", "y"))
+ tab1 <- read_feather(tf1, col_select = c("x", "y"))
expect_is(tab1, "data.frame")
expect_equal(tib$x, tab1$x)
@@ -78,14 +78,14 @@ test_that("feather handles columns = <names>", {
unlink(tf1)
})
-test_that("feather handles columns = <integer>", {
+test_that("feather handles col_select = <integer>", {
tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
tf1 <- tempfile()
write_feather(tib, tf1)
expect_true(fs::file_exists(tf1))
- tab1 <- read_feather(tf1, columns = 1:2)
+ tab1 <- read_feather(tf1, col_select = 1:2)
expect_is(tab1, "data.frame")
expect_equal(tib$x, tab1$x)
@@ -93,6 +93,28 @@ test_that("feather handles columns = <integer>", {
unlink(tf1)
})
+test_that("feather handles col_select = <tidyselect helper>", {
+ tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+ tf1 <- tempfile()
+ write_feather(tib, tf1)
+ expect_true(fs::file_exists(tf1))
+
+ tab1 <- read_feather(tf1, col_select = everything())
+ expect_identical(tib, tab1)
+
+ tab2 <- read_feather(tf1, col_select = starts_with("x"))
+ expect_identical(tab2, tib[, "x", drop = FALSE])
+
+ tab3 <- read_feather(tf1, col_select = c(starts_with("x"), contains("y")))
+ expect_identical(tab3, tib[, c("x", "y"), drop = FALSE])
+
+ tab4 <- read_feather(tf1, col_select = -z)
+ expect_identical(tab4, tib[, c("x", "y"), drop = FALSE])
+
+ unlink(tf1)
+})
+
test_that("feather read/write round trip", {
tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index 1521627..9413904 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -70,6 +70,22 @@ test_that("read_json_arrow() converts to tibble", {
expect_equal(tab1$yo, c("thing", NA, "\u5fcd", NA))
})
+test_that("read_json_arrow() supports col_select=", {
+ tf <- tempfile()
+ writeLines('
+ { "hello": 3.5, "world": false, "yo": "thing" }
+ { "hello": 3.25, "world": null }
+ { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+ { "hello": 0.0, "world": true, "yo": null }
+ ', tf)
+
+ tab1 <- read_json_arrow(tf, col_select = c(hello, world))
+ expect_equal(names(tab1), c("hello", "world"))
+
+ tab2 <- read_json_arrow(tf, col_select = 1:2)
+ expect_equal(names(tab2), c("hello", "world"))
+})
+
test_that("Can read json file with nested columns (ARROW-5503)", {
tf <- tempfile()
on.exit(unlink(tf))
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 554744e..f23c37f 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -35,3 +35,11 @@ test_that("simple int column roundtrip", {
df_read <- read_parquet(pq_tmp_file)
expect_identical(df, df_read)
})
+
+test_that("read_parquet() supports col_select", {
+ df <- read_parquet(pq_file, col_select = c(x, y, z))
+ expect_equal(names(df), c("x", "y", "z"))
+
+ df <- read_parquet(pq_file, col_select = starts_with("c"))
+ expect_equal(names(df), c("carat", "cut", "color", "clarity"))
+})