You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/10 15:51:48 UTC

[arrow] branch master updated: ARROW-3942: [R] Feather api fixes

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9c8ddae  ARROW-3942: [R] Feather api fixes
9c8ddae is described below

commit 9c8ddae11622ace00a187c46412309af82191b74
Author: Romain Francois <ro...@purrple.cat>
AuthorDate: Mon Dec 10 09:51:41 2018 -0600

    ARROW-3942: [R] Feather api fixes
    
    Some fixes  to follow up open #3043, and added the columns argument to `read_feather` that can be:
    
     - character vector
     - integer vector : 1-based in R
     - NULL: to get all columns (the default)
    
    Also adds `as_tibble` argument to read_feather to switch between data.frame and arrow::Table return value
    
    Author: Romain Francois <ro...@purrple.cat>
    
    Closes #3106 from romainfrancois/ARROW-3942/feather and squashes the following commits:
    
    13061af4d <Romain Francois> fixed link in documentation
    ce414c153 <Romain Francois> + as_tibble argument to read_feather()
    d6c30a38b <Romain Francois> + columns argument to read_feather()
    46a6fbb69 <Romain Francois> Update feather factories
---
 r/NAMESPACE                                        | 16 +++---
 r/R/RcppExports.R                                  |  4 +-
 r/R/feather.R                                      | 44 +++++++++-------
 ...ather_table_reader.Rd => FeatherTableReader.Rd} |  6 +--
 ...ather_table_writer.Rd => FeatherTableWriter.Rd} |  6 +--
 r/man/read_feather.Rd                              | 10 ++--
 r/src/RcppExports.cpp                              |  9 ++--
 r/src/feather.cpp                                  | 32 +++++++++++-
 r/tests/testthat/test-feather.R                    | 59 ++++++++++++++++++----
 9 files changed, 134 insertions(+), 52 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index cc5961e..65d60d8 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -8,6 +8,12 @@ S3method("==","arrow::RecordBatch")
 S3method("==","arrow::ipc::Message")
 S3method(BufferReader,"arrow::Buffer")
 S3method(BufferReader,default)
+S3method(FeatherTableReader,"arrow::io::RandomAccessFile")
+S3method(FeatherTableReader,"arrow::ipc::feather::TableReader")
+S3method(FeatherTableReader,character)
+S3method(FeatherTableReader,default)
+S3method(FeatherTableReader,fs_path)
+S3method(FeatherTableWriter,"arrow::io::OutputStream")
 S3method(FixedSizeBufferWriter,"arrow::Buffer")
 S3method(FixedSizeBufferWriter,default)
 S3method(MessageReader,"arrow::io::InputStream")
@@ -33,12 +39,6 @@ S3method(buffer,default)
 S3method(buffer,integer)
 S3method(buffer,numeric)
 S3method(buffer,raw)
-S3method(feather_table_reader,"arrow::io::RandomAccessFile")
-S3method(feather_table_reader,"arrow::ipc::feather::TableReader")
-S3method(feather_table_reader,character)
-S3method(feather_table_reader,default)
-S3method(feather_table_reader,fs_path)
-S3method(feather_table_writer,"arrow::io::OutputStream")
 S3method(length,"arrow::Array")
 S3method(names,"arrow::RecordBatch")
 S3method(print,"arrow-enum")
@@ -70,6 +70,8 @@ S3method(write_feather_RecordBatch,fs_path)
 export(BufferOutputStream)
 export(BufferReader)
 export(DateUnit)
+export(FeatherTableReader)
+export(FeatherTableWriter)
 export(FileMode)
 export(FileOutputStream)
 export(FixedSizeBufferWriter)
@@ -95,8 +97,6 @@ export(date64)
 export(decimal)
 export(default_memory_pool)
 export(dictionary)
-export(feather_table_reader)
-export(feather_table_writer)
 export(field)
 export(float16)
 export(float32)
diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R
index ccf8549..0310eab 100644
--- a/r/R/RcppExports.R
+++ b/r/R/RcppExports.R
@@ -445,8 +445,8 @@ ipc___feather___TableReader__GetColumn <- function(reader, i) {
     .Call(`_arrow_ipc___feather___TableReader__GetColumn`, reader, i)
 }
 
-ipc___feather___TableReader__Read <- function(reader) {
-    .Call(`_arrow_ipc___feather___TableReader__Read`, reader)
+ipc___feather___TableReader__Read <- function(reader, columns) {
+    .Call(`_arrow_ipc___feather___TableReader__Read`, reader, columns)
 }
 
 ipc___feather___TableReader__Open <- function(stream) {
diff --git a/r/R/feather.R b/r/R/feather.R
index bae71d3..0646521 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -35,7 +35,9 @@
     num_columns = function() ipc___feather___TableReader__num_columns(self),
     GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i),
     GetColumn = function(i) shared_ptr(`arrow::Column`, ipc___feather___TableReader__GetColumn(self, i)),
-    Read = function() shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self))
+    Read = function(columns) {
+      shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self, columns))
+    }
   )
 )
 
@@ -44,12 +46,12 @@
 #' @param stream an OutputStream
 #'
 #' @export
-feather_table_writer <- function(stream) {
-  UseMethod("feather_table_writer")
+FeatherTableWriter <- function(stream) {
+  UseMethod("FeatherTableWriter")
 }
 
 #' @export
-`feather_table_writer.arrow::io::OutputStream` <- function(stream){
+`FeatherTableWriter.arrow::io::OutputStream` <- function(stream){
   unique_ptr(`arrow::ipc::feather::TableWriter`, ipc___feather___TableWriter__Open(stream))
 }
 
@@ -107,7 +109,7 @@ write_feather_RecordBatch <- function(data, stream) {
 #' @export
 #' @method write_feather_RecordBatch arrow::io::OutputStream
 `write_feather_RecordBatch.arrow::io::OutputStream` <- function(data, stream) {
-  ipc___TableWriter__RecordBatch__WriteFeather(feather_table_writer(stream), data)
+  ipc___TableWriter__RecordBatch__WriteFeather(FeatherTableWriter(stream), data)
 }
 
 #' A arrow::ipc::feather::TableReader to read from a file
@@ -117,44 +119,50 @@ write_feather_RecordBatch <- function(data, stream) {
 #' @param ... extra parameters
 #'
 #' @export
-feather_table_reader <- function(file, mmap = TRUE, ...){
-  UseMethod("feather_table_reader")
+FeatherTableReader <- function(file, mmap = TRUE, ...){
+  UseMethod("FeatherTableReader")
 }
 
 #' @export
-feather_table_reader.default <- function(file, mmap = TRUE, ...) {
+FeatherTableReader.default <- function(file, mmap = TRUE, ...) {
   stop("unsupported")
 }
 
 #' @export
-feather_table_reader.character <- function(file, mmap = TRUE, ...) {
-  feather_table_reader(fs::path_abs(file), mmap = mmap, ...)
+FeatherTableReader.character <- function(file, mmap = TRUE, ...) {
+  FeatherTableReader(fs::path_abs(file), mmap = mmap, ...)
 }
 
 #' @export
-feather_table_reader.fs_path <- function(file, mmap = TRUE, ...) {
+FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) {
   stream <- if(isTRUE(mmap)) mmap_open(file, ...) else ReadableFile(file, ...)
-  feather_table_reader(stream)
+  FeatherTableReader(stream)
 }
 
 #' @export
-`feather_table_reader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){
+`FeatherTableReader.arrow::io::RandomAccessFile` <- function(file, mmap = TRUE, ...){
   unique_ptr(`arrow::ipc::feather::TableReader`, ipc___feather___TableReader__Open(file))
 }
 
 #' @export
-`feather_table_reader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){
+`FeatherTableReader.arrow::ipc::feather::TableReader` <- function(file, mmap = TRUE, ...){
   file
 }
 
 #' Read a feather file
 #'
-#' @param file a arrow::ipc::feather::TableReader or whatever the [feather_table_reader()] function can handle
+#' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle
+#' @param columns names if the columns to read. The default `NULL` means all columns
+#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble.
 #' @param ... additional parameters
 #'
-#' @return an arrow::Table
+#' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise
 #'
 #' @export
-read_feather <- function(file, ...){
-  feather_table_reader(file, ...)$Read()
+read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){
+  out <- FeatherTableReader(file, ...)$Read(columns)
+  if (isTRUE(as_tibble)) {
+    out <- as_tibble(out)
+  }
+  out
 }
diff --git a/r/man/feather_table_reader.Rd b/r/man/FeatherTableReader.Rd
similarity index 80%
rename from r/man/feather_table_reader.Rd
rename to r/man/FeatherTableReader.Rd
index fb1c534..15a260b 100644
--- a/r/man/feather_table_reader.Rd
+++ b/r/man/FeatherTableReader.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/feather.R
-\name{feather_table_reader}
-\alias{feather_table_reader}
+\name{FeatherTableReader}
+\alias{FeatherTableReader}
 \title{A arrow::ipc::feather::TableReader to read from a file}
 \usage{
-feather_table_reader(file, mmap = TRUE, ...)
+FeatherTableReader(file, mmap = TRUE, ...)
 }
 \arguments{
 \item{file}{A file path, arrow::io::RandomAccessFile}
diff --git a/r/man/feather_table_writer.Rd b/r/man/FeatherTableWriter.Rd
similarity index 74%
rename from r/man/feather_table_writer.Rd
rename to r/man/FeatherTableWriter.Rd
index 36035ac..3acf597 100644
--- a/r/man/feather_table_writer.Rd
+++ b/r/man/FeatherTableWriter.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/feather.R
-\name{feather_table_writer}
-\alias{feather_table_writer}
+\name{FeatherTableWriter}
+\alias{FeatherTableWriter}
 \title{Create TableWriter that writes into a stream}
 \usage{
-feather_table_writer(stream)
+FeatherTableWriter(stream)
 }
 \arguments{
 \item{stream}{an OutputStream}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index e86b86b..31fd36a 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -4,15 +4,19 @@
 \alias{read_feather}
 \title{Read a feather file}
 \usage{
-read_feather(file, ...)
+read_feather(file, columns = NULL, as_tibble = TRUE, ...)
 }
 \arguments{
-\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=feather_table_reader]{feather_table_reader()}} function can handle}
+\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle}
+
+\item{columns}{names if the columns to read. The default \code{NULL} means all columns}
+
+\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.}
 
 \item{...}{additional parameters}
 }
 \value{
-an arrow::Table
+a data frame if \code{as_tibble} is \code{TRUE} (the default), or a \link[=arrow__Table]{arrow::Table} otherwise
 }
 \description{
 Read a feather file
diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp
index bca4eaf..e5a784e 100644
--- a/r/src/RcppExports.cpp
+++ b/r/src/RcppExports.cpp
@@ -1244,13 +1244,14 @@ BEGIN_RCPP
 END_RCPP
 }
 // ipc___feather___TableReader__Read
-std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(const std::unique_ptr<arrow::ipc::feather::TableReader>& reader);
-RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP) {
+std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, SEXP columns);
+RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP readerSEXP, SEXP columnsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< const std::unique_ptr<arrow::ipc::feather::TableReader>& >::type reader(readerSEXP);
-    rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader));
+    Rcpp::traits::input_parameter< SEXP >::type columns(columnsSEXP);
+    rcpp_result_gen = Rcpp::wrap(ipc___feather___TableReader__Read(reader, columns));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -2262,7 +2263,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_arrow_ipc___feather___TableReader__num_columns", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_columns, 1},
     {"_arrow_ipc___feather___TableReader__GetColumnName", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumnName, 2},
     {"_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2},
-    {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 1},
+    {"_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2},
     {"_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1},
     {"_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3},
     {"_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1},
diff --git a/r/src/feather.cpp b/r/src/feather.cpp
index 7b84dee..8389156 100644
--- a/r/src/feather.cpp
+++ b/r/src/feather.cpp
@@ -115,9 +115,37 @@ std::shared_ptr<arrow::Column> ipc___feather___TableReader__GetColumn(
 
 // [[Rcpp::export]]
 std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
+    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, SEXP columns) {
   std::shared_ptr<arrow::Table> table;
-  STOP_IF_NOT_OK(reader->Read(&table));
+
+  switch (TYPEOF(columns)) {
+    case INTSXP: {
+      R_xlen_t n = XLENGTH(columns);
+      std::vector<int> indices(n);
+      int* p_columns = INTEGER(columns);
+      for (int i = 0; i < n; i++) {
+        indices[i] = p_columns[i] - 1;
+      }
+      STOP_IF_NOT_OK(reader->Read(indices, &table));
+      break;
+    }
+    case STRSXP: {
+      R_xlen_t n = XLENGTH(columns);
+      std::vector<std::string> names(n);
+      for (R_xlen_t i = 0; i < n; i++) {
+        names[i] = CHAR(STRING_ELT(columns, i));
+      }
+      STOP_IF_NOT_OK(reader->Read(names, &table));
+      break;
+    }
+    case NILSXP:
+      STOP_IF_NOT_OK(reader->Read(&table));
+      break;
+    default:
+      Rcpp::stop("incompatible column specification");
+      break;
+  };
+
   return table;
 }
 
diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R
index 715017f..23fdc58 100644
--- a/r/tests/testthat/test-feather.R
+++ b/r/tests/testthat/test-feather.R
@@ -34,25 +34,66 @@ test_that("feather read/write round trip", {
   expect_true(fs::file_exists(tf3))
 
   tab1 <- read_feather(tf1)
-  expect_is(tab1, "arrow::Table")
+  expect_is(tab1, "data.frame")
 
   tab2 <- read_feather(tf2)
-  expect_is(tab2, "arrow::Table")
+  expect_is(tab2, "data.frame")
 
   tab3 <- read_feather(tf3)
-  expect_is(tab3, "arrow::Table")
+  expect_is(tab3, "data.frame")
 
   # reading directly from arrow::io::MemoryMappedFile
   tab4 <- read_feather(mmap_open(tf3))
-  expect_is(tab4, "arrow::Table")
+  expect_is(tab4, "data.frame")
 
   # reading directly from arrow::io::ReadableFile
   tab5 <- read_feather(ReadableFile(tf3))
-  expect_is(tab5, "arrow::Table")
+  expect_is(tab5, "data.frame")
+
+  expect_equal(tib, tab1)
+  expect_equal(tib, tab2)
+  expect_equal(tib, tab3)
+  expect_equal(tib, tab4)
+  expect_equal(tib, tab5)
+})
+
+test_that("feather handles columns = <names>", {
+  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+  tf1 <- local_tempfile()
+  write_feather(tib, tf1)
+  expect_true(fs::file_exists(tf1))
+
+  tab1 <- read_feather(tf1, columns = c("x", "y"))
+  expect_is(tab1, "data.frame")
+
+  expect_equal(tib[, c("x", "y")], as_tibble(tab1))
+})
+
+test_that("feather handles columns = <integer>", {
+  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+  tf1 <- local_tempfile()
+  write_feather(tib, tf1)
+  expect_true(fs::file_exists(tf1))
+
+  tab1 <- read_feather(tf1, columns = 1:2)
+  expect_is(tab1, "data.frame")
+
+  expect_equal(tib[, c("x", "y")], as_tibble(tab1))
+})
+
+test_that("feather read/write round trip", {
+  tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10])
+
+  tf1 <- local_tempfile()
+  write_feather(tib, tf1)
+  expect_true(fs::file_exists(tf1))
+
+  tab1 <- read_feather(tf1, as_tibble = FALSE)
+  expect_is(tab1, "arrow::Table")
 
   expect_equal(tib, as_tibble(tab1))
-  expect_equal(tib, as_tibble(tab2))
-  expect_equal(tib, as_tibble(tab3))
-  expect_equal(tib, as_tibble(tab4))
-  expect_equal(tib, as_tibble(tab5))
 })
+
+