You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/18 17:28:17 UTC

[arrow] branch master updated: ARROW-5586: [R] convert Array of LIST type to R lists

This is an automated email from the ASF dual-hosted git repository.

romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2106ddc  ARROW-5586: [R] convert Array of LIST type to R lists
2106ddc is described below

commit 2106ddc7ad9dd455d21da3ef25a5ac88f6fadc29
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Tue Jun 18 19:28:05 2019 +0200

    ARROW-5586: [R] convert Array of LIST type to R lists
    
    We don't have the reverse operation yet (converting an R data structure to a list array), so list arrays aren't easy to make, but we can get some with the JSON reader, e.g.:
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    
    tf <- tempfile()
    writeLines('
        { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
        { "hello": 3.25, "world": null, "arr": [2], "nuf": null }
        { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
        { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
      ', tf)
    
    tab1 <- read_json_arrow(tf, as_tibble = FALSE)
    list_array <- tab1$column(3L)$data()$chunk(0)
    list_array
    #> arrow::ListArray
    #> [
    #>   [
    #>     1,
    #>     2,
    #>     3
    #>   ],
    #>   [
    #>     2
    #>   ],
    #>   [],
    #>   null
    #> ]
    list_array$values
    #> function() `arrow::Array`$dispatch(ListArray__values(self))
    #> <environment: 0x7f93c3b5cf88>
    list_array$value_length(0)
    #> [1] 3
    list_array$value_offset(0)
    #> [1] 0
    list_array$raw_value_offsets()
    #> [1] 0 3 4 4
    
    list_array$as_vector()
    #> [[1]]
    #> integer64
    #> [1] 1 2 3
    #>
    #> [[2]]
    #> integer64
    #> [1] 2
    #>
    #> [[3]]
    #> integer64
    #> character(0)
    #>
    #> [[4]]
    #> NULL
    ```
    
    <sup>Created on 2019-06-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000)</sup>
    
    Author: Romain Francois <ro...@rstudio.com>
    
    Closes #4575 from romainfrancois/ARROW-5586/List_type and squashes the following commits:
    
    300f897d <Romain Francois> merge tests
    5b62e286 <Romain Francois> +ListType::value_(field|type)
    375d3b53 <Romain Francois> ListArray to R list
    3557362e <Romain Francois> more methods for ListArray
    b63b705f <Romain Francois> + class ListArray, ListArray$value_type
---
 r/R/List.R                       |   6 ++-
 r/R/array.R                      |  14 +++++
 r/R/arrowExports.R               |  28 ++++++++++
 r/src/array.cpp                  |  31 +++++++++++
 r/src/array__to_vector.cpp       |  45 ++++++++++++++++
 r/src/arrowExports.cpp           | 114 +++++++++++++++++++++++++++++++++++++++
 r/src/datatype.cpp               |  12 +++++
 r/tests/testthat/test-DataType.R |   2 +
 r/tests/testthat/test-json.R     |  39 ++++++++++----
 9 files changed, 281 insertions(+), 10 deletions(-)

diff --git a/r/R/List.R b/r/R/List.R
index cc8c2b1..efd8839 100644
--- a/r/R/List.R
+++ b/r/R/List.R
@@ -18,7 +18,11 @@
 #' @include R6.R
 
 `arrow::ListType` <- R6Class("arrow::ListType",
-  inherit = `arrow::NestedType`
+  inherit = `arrow::NestedType`,
+  active = list(
+    value_field = function() shared_ptr(`arrow::Field`, ListType__value_field(self)),
+    value_type = function() `arrow::DataType`$dispatch(ListType__value_type(self))
+  )
 )
 
 #' @rdname DataType
diff --git a/r/R/array.R b/r/R/array.R
index b6e21ef..7e5e955 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -118,12 +118,26 @@
   )
 )
 
+`arrow::ListArray` <- R6Class("arrow::ListArray", inherit = `arrow::Array`,
+  public = list(
+    values = function() `arrow::Array`$dispatch(ListArray__values(self)),
+    value_length = function(i) ListArray__value_length(self, i),
+    value_offset = function(i) ListArray__value_offset(self, i),
+    raw_value_offsets = function() ListArray__raw_value_offsets(self)
+  ),
+  active = list(
+    value_type = function() `arrow::DataType`$dispatch(ListArray__value_type(self))
+  )
+)
+
 `arrow::Array`$dispatch <- function(xp){
   a <- shared_ptr(`arrow::Array`, xp)
   if(a$type_id() == Type$DICTIONARY){
     a <- shared_ptr(`arrow::DictionaryArray`, xp)
   } else if (a$type_id() == Type$STRUCT) {
     a <- shared_ptr(`arrow::StructArray`, xp)
+  } else if(a$type_id() == Type$LIST) {
+    a <- shared_ptr(`arrow::ListArray`, xp)
   }
   a
 }
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index a6fd72d..908c50a 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -80,6 +80,26 @@ StructArray__Flatten <- function(array){
     .Call(`_arrow_StructArray__Flatten` , array)
 }
 
+ListArray__value_type <- function(array){
+    .Call(`_arrow_ListArray__value_type` , array)
+}
+
+ListArray__values <- function(array){
+    .Call(`_arrow_ListArray__values` , array)
+}
+
+ListArray__value_length <- function(array, i){
+    .Call(`_arrow_ListArray__value_length` , array, i)
+}
+
+ListArray__value_offset <- function(array, i){
+    .Call(`_arrow_ListArray__value_offset` , array, i)
+}
+
+ListArray__raw_value_offsets <- function(array){
+    .Call(`_arrow_ListArray__raw_value_offsets` , array)
+}
+
 Array__as_vector <- function(array){
     .Call(`_arrow_Array__as_vector` , array)
 }
@@ -456,6 +476,14 @@ StructType__GetFieldIndex <- function(type, name){
     .Call(`_arrow_StructType__GetFieldIndex` , type, name)
 }
 
+ListType__value_field <- function(type){
+    .Call(`_arrow_ListType__value_field` , type)
+}
+
+ListType__value_type <- function(type){
+    .Call(`_arrow_ListType__value_type` , type)
+}
+
 ipc___feather___TableWriter__SetDescription <- function(writer, description){
     invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description))
 }
diff --git a/r/src/array.cpp b/r/src/array.cpp
index 35da4b1..7e4fa6f 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -140,4 +140,35 @@ arrow::ArrayVector StructArray__Flatten(
   return out;
 }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListArray__value_type(
+    const std::shared_ptr<arrow::ListArray>& array) {
+  return array->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ListArray__values(
+    const std::shared_ptr<arrow::ListArray>& array) {
+  return array->values();
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array,
+                                int64_t i) {
+  return array->value_length(i);
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array,
+                                int64_t i) {
+  return array->value_offset(i);
+}
+
+// [[arrow::export]]
+Rcpp::IntegerVector ListArray__raw_value_offsets(
+    const std::shared_ptr<arrow::ListArray>& array) {
+  auto offsets = array->raw_value_offsets();
+  return Rcpp::IntegerVector(offsets, offsets + array->length());
+}
+
 #endif
diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp
index 4e26f8d..7fcb02b 100644
--- a/r/src/array__to_vector.cpp
+++ b/r/src/array__to_vector.cpp
@@ -547,6 +547,47 @@ class Converter_Decimal : public Converter {
   }
 };
 
+class Converter_List : public Converter {
+ public:
+  explicit Converter_List(const ArrayVector& arrays) : Converter(arrays) {}
+
+  SEXP Allocate(R_xlen_t n) const { return Rcpp::List(no_init(n)); }
+
+  Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+    // nothing to do: R list elements are NULL by default
+    return Status::OK();
+  }
+
+  Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+                           R_xlen_t start, R_xlen_t n) const {
+    using internal::checked_cast;
+    auto list_array = checked_cast<arrow::ListArray*>(array.get());
+    auto values_array = list_array->values();
+
+    auto ingest_one = [&](R_xlen_t i) {
+      auto slice =
+          values_array->Slice(list_array->value_offset(i), list_array->value_length(i));
+      SET_VECTOR_ELT(data, i + start, Array__as_vector(slice));
+    };
+
+    if (array->null_count()) {
+      internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(),
+                                           n);
+
+      for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) {
+        if (bitmap_reader.IsSet()) ingest_one(i);
+      }
+
+    } else {
+      for (R_xlen_t i = 0; i < n; i++) {
+        ingest_one(i);
+      }
+    }
+
+    return Status::OK();
+  }
+};
+
 class Converter_Int64 : public Converter {
  public:
   explicit Converter_Int64(const ArrayVector& arrays) : Converter(arrays) {}
@@ -658,9 +699,13 @@ std::shared_ptr<Converter> Converter::Make(const ArrayVector& arrays) {
     case Type::DECIMAL:
       return std::make_shared<arrow::r::Converter_Decimal>(arrays);
 
+      // nested
     case Type::STRUCT:
       return std::make_shared<arrow::r::Converter_Struct>(arrays);
 
+    case Type::LIST:
+      return std::make_shared<arrow::r::Converter_List>(arrays);
+
     default:
       break;
   }
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 8d5ed3e..31e27ea 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -317,6 +317,83 @@ RcppExport SEXP _arrow_StructArray__Flatten(SEXP array_sexp){
 }
 #endif
 
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListArray__value_type(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+	return Rcpp::wrap(ListArray__value_type(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+	Rf_error("Cannot call ListArray__value_type(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ListArray__values(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+	return Rcpp::wrap(ListArray__values(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){
+	Rf_error("Cannot call ListArray__values(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+	Rcpp::traits::input_parameter<int64_t>::type i(i_sexp);
+	return Rcpp::wrap(ListArray__value_length(array, i));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+	Rf_error("Cannot call ListArray__value_length(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+	Rcpp::traits::input_parameter<int64_t>::type i(i_sexp);
+	return Rcpp::wrap(ListArray__value_offset(array, i));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+	Rf_error("Cannot call ListArray__value_offset(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+Rcpp::IntegerVector ListArray__raw_value_offsets(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+	return Rcpp::wrap(ListArray__raw_value_offsets(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+	Rf_error("Cannot call ListArray__raw_value_offsets(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
 // array__to_vector.cpp
 #if defined(ARROW_R_WITH_ARROW)
 SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
@@ -1743,6 +1820,36 @@ RcppExport SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp)
 }
 #endif
 
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> ListType__value_field(const std::shared_ptr<arrow::ListType>& type);
+RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+	return Rcpp::wrap(ListType__value_field(type));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){
+	Rf_error("Cannot call ListType__value_field(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListType__value_type(const std::shared_ptr<arrow::ListType>& type);
+RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+	return Rcpp::wrap(ListType__value_type(type));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){
+	Rf_error("Cannot call ListType__value_type(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
 // feather.cpp
 #if defined(ARROW_R_WITH_ARROW)
 void ipc___feather___TableWriter__SetDescription(const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer, const std::string& description);
@@ -3388,6 +3495,11 @@ static const R_CallMethodDef CallEntries[] = {
 		{ "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2}, 
 		{ "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2}, 
 		{ "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1}, 
+		{ "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1}, 
+		{ "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1}, 
+		{ "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2}, 
+		{ "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2}, 
+		{ "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1}, 
 		{ "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1}, 
 		{ "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1}, 
 		{ "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2}, 
@@ -3482,6 +3594,8 @@ static const R_CallMethodDef CallEntries[] = {
 		{ "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1}, 
 		{ "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2}, 
 		{ "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2}, 
+		{ "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1}, 
+		{ "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1}, 
 		{ "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, 
 		{ "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, 
 		{ "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, 
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
index 18920f2..f4a4b09 100644
--- a/r/src/datatype.cpp
+++ b/r/src/datatype.cpp
@@ -281,4 +281,16 @@ int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type,
   return type->GetFieldIndex(name);
 }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> ListType__value_field(
+    const std::shared_ptr<arrow::ListType>& type) {
+  return type->value_field();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListType__value_type(
+    const std::shared_ptr<arrow::ListType>& type) {
+  return type->value_type();
+}
+
 #endif
diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R
index 6f77b3b..dfc0d53 100644
--- a/r/tests/testthat/test-DataType.R
+++ b/r/tests/testthat/test-DataType.R
@@ -297,6 +297,8 @@ test_that("list type works as expected", {
     x$children(),
     list(field("item", int32()))
   )
+  expect_equal(x$value_type, int32())
+  expect_equal(x$value_field, field("item", int32()))
 })
 
 test_that("struct type works as expected", {
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index 38b20a8..206dfdd 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -75,12 +75,12 @@ test_that("read_json_arrow() converts to tibble", {
 test_that("Can read json file with nested columns (ARROW-5503)", {
   tf <- tempfile()
   writeLines('
-    { "nuf": {} }
-    { "nuf": null }
-    { "nuf": { "ps": 78.0, "hello": "hi" } }
-    { "nuf": { "ps": 90.0, "hello": "bonjour" } }
-    { "nuf": { "hello": "ciao" } }
-    { "nuf": { "ps": 19 } }
+    { "arr": [1.0, 2.0, 3.0], "nuf": {} }
+    { "arr": [2.0], "nuf": null }
+    { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } }
+    { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } }
+    { "arr": [5.0], "nuf": { "hello": "ciao" } }
+    { "arr": [5.0, 6.0], "nuf": { "ps": 19 } }
   ', tf)
 
   tab1 <- read_json_arrow(tf, as_tibble = FALSE)
@@ -93,11 +93,12 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
   expect_equal(
     tab1$schema,
     schema(
+      arr = list_of(float64()),
       nuf = struct(ps = float64(), hello = utf8())
     )
   )
 
-  struct_array <- tab1$column(0)$data()$chunk(0)
+  struct_array <- tab1$column(1)$data()$chunk(0)
   ps <- array(c(NA, NA, 78, 90, NA, 19))
   hello <- array(c(NA, NA, "hi", "bonjour", "ciao", NA))
   expect_equal(struct_array$field(0L), ps)
@@ -108,8 +109,28 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
     data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE)
   )
 
-  # cannot yet test list and struct types in R api
-  # tib <- as.data.frame(tab1)
+  list_array_r <- list(
+    c(1, 2, 3),
+    c(2),
+    numeric(),
+    NULL,
+    5,
+    c(5, 6)
+  )
+  list_array <- tab1$column(0)$data()
+  expect_identical(
+    list_array$as_vector(),
+    list_array_r
+  )
+
+  tib <- as.data.frame(tab1)
+  expect_identical(
+    tib,
+    tibble::tibble(
+      arr = list_array_r,
+      nuf = data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE)
+    )
+  )
 
   unlink(tf)
 })