You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/18 17:28:17 UTC
[arrow] branch master updated: ARROW-5586: [R] convert Array of LIST type to R lists
This is an automated email from the ASF dual-hosted git repository.
romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2106ddc ARROW-5586: [R] convert Array of LIST type to R lists
2106ddc is described below
commit 2106ddc7ad9dd455d21da3ef25a5ac88f6fadc29
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Tue Jun 18 19:28:05 2019 +0200
ARROW-5586: [R] convert Array of LIST type to R lists
We don't have the reverse operation yet (converting from an R data structure to a list array), so list arrays aren't easy to make, but we can get some with the JSON reader, e.g.:
``` r
library(arrow, warn.conflicts = FALSE)
tf <- tempfile()
writeLines('
{ "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
{ "hello": 3.25, "world": null, "arr": [2], "nuf": null }
{ "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
{ "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
', tf)
tab1 <- read_json_arrow(tf, as_tibble = FALSE)
list_array <- tab1$column(3L)$data()$chunk(0)
list_array
#> arrow::ListArray
#> [
#> [
#> 1,
#> 2,
#> 3
#> ],
#> [
#> 2
#> ],
#> [],
#> null
#> ]
list_array$values
#> function() `arrow::Array`$dispatch(ListArray__values(self))
#> <environment: 0x7f93c3b5cf88>
list_array$value_length(0)
#> [1] 3
list_array$value_offset(0)
#> [1] 0
list_array$raw_value_offsets()
#> [1] 0 3 4 4
list_array$as_vector()
#> [[1]]
#> integer64
#> [1] 1 2 3
#>
#> [[2]]
#> integer64
#> [1] 2
#>
#> [[3]]
#> integer64
#> character(0)
#>
#> [[4]]
#> NULL
```
<sup>Created on 2019-06-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0.9000)</sup>
Author: Romain Francois <ro...@rstudio.com>
Closes #4575 from romainfrancois/ARROW-5586/List_type and squashes the following commits:
300f897d <Romain Francois> merge tests
5b62e286 <Romain Francois> +ListType::value_(field|type)
375d3b53 <Romain Francois> ListArray to R list
3557362e <Romain Francois> more methods for ListArray
b63b705f <Romain Francois> + class ListArray, ListArray$value_type
---
r/R/List.R | 6 ++-
r/R/array.R | 14 +++++
r/R/arrowExports.R | 28 ++++++++++
r/src/array.cpp | 31 +++++++++++
r/src/array__to_vector.cpp | 45 ++++++++++++++++
r/src/arrowExports.cpp | 114 +++++++++++++++++++++++++++++++++++++++
r/src/datatype.cpp | 12 +++++
r/tests/testthat/test-DataType.R | 2 +
r/tests/testthat/test-json.R | 39 ++++++++++----
9 files changed, 281 insertions(+), 10 deletions(-)
diff --git a/r/R/List.R b/r/R/List.R
index cc8c2b1..efd8839 100644
--- a/r/R/List.R
+++ b/r/R/List.R
@@ -18,7 +18,11 @@
#' @include R6.R
`arrow::ListType` <- R6Class("arrow::ListType",
- inherit = `arrow::NestedType`
+ inherit = `arrow::NestedType`,
+ active = list(
+ value_field = function() shared_ptr(`arrow::Field`, ListType__value_field(self)),
+ value_type = function() `arrow::DataType`$dispatch(ListType__value_type(self))
+ )
)
#' @rdname DataType
diff --git a/r/R/array.R b/r/R/array.R
index b6e21ef..7e5e955 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -118,12 +118,26 @@
)
)
+`arrow::ListArray` <- R6Class("arrow::ListArray", inherit = `arrow::Array`,
+ public = list(
+ values = function() `arrow::Array`$dispatch(ListArray__values(self)),
+ value_length = function(i) ListArray__value_length(self, i),
+ value_offset = function(i) ListArray__value_offset(self, i),
+ raw_value_offsets = function() ListArray__raw_value_offsets(self)
+ ),
+ active = list(
+ value_type = function() `arrow::DataType`$dispatch(ListArray__value_type(self))
+ )
+)
+
`arrow::Array`$dispatch <- function(xp){
a <- shared_ptr(`arrow::Array`, xp)
if(a$type_id() == Type$DICTIONARY){
a <- shared_ptr(`arrow::DictionaryArray`, xp)
} else if (a$type_id() == Type$STRUCT) {
a <- shared_ptr(`arrow::StructArray`, xp)
+ } else if(a$type_id() == Type$LIST) {
+ a <- shared_ptr(`arrow::ListArray`, xp)
}
a
}
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index a6fd72d..908c50a 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -80,6 +80,26 @@ StructArray__Flatten <- function(array){
.Call(`_arrow_StructArray__Flatten` , array)
}
+ListArray__value_type <- function(array){
+ .Call(`_arrow_ListArray__value_type` , array)
+}
+
+ListArray__values <- function(array){
+ .Call(`_arrow_ListArray__values` , array)
+}
+
+ListArray__value_length <- function(array, i){
+ .Call(`_arrow_ListArray__value_length` , array, i)
+}
+
+ListArray__value_offset <- function(array, i){
+ .Call(`_arrow_ListArray__value_offset` , array, i)
+}
+
+ListArray__raw_value_offsets <- function(array){
+ .Call(`_arrow_ListArray__raw_value_offsets` , array)
+}
+
Array__as_vector <- function(array){
.Call(`_arrow_Array__as_vector` , array)
}
@@ -456,6 +476,14 @@ StructType__GetFieldIndex <- function(type, name){
.Call(`_arrow_StructType__GetFieldIndex` , type, name)
}
+ListType__value_field <- function(type){
+ .Call(`_arrow_ListType__value_field` , type)
+}
+
+ListType__value_type <- function(type){
+ .Call(`_arrow_ListType__value_type` , type)
+}
+
ipc___feather___TableWriter__SetDescription <- function(writer, description){
invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description))
}
diff --git a/r/src/array.cpp b/r/src/array.cpp
index 35da4b1..7e4fa6f 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -140,4 +140,35 @@ arrow::ArrayVector StructArray__Flatten(
return out;
}
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListArray__value_type(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ return array->value_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ListArray__values(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ return array->values();
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array,
+ int64_t i) {
+ return array->value_length(i);
+}
+
+// [[arrow::export]]
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array,
+ int64_t i) {
+ return array->value_offset(i);
+}
+
+// [[arrow::export]]
+Rcpp::IntegerVector ListArray__raw_value_offsets(
+ const std::shared_ptr<arrow::ListArray>& array) {
+ auto offsets = array->raw_value_offsets();
+ return Rcpp::IntegerVector(offsets, offsets + array->length());
+}
+
#endif
diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp
index 4e26f8d..7fcb02b 100644
--- a/r/src/array__to_vector.cpp
+++ b/r/src/array__to_vector.cpp
@@ -547,6 +547,47 @@ class Converter_Decimal : public Converter {
}
};
+class Converter_List : public Converter {
+ public:
+ explicit Converter_List(const ArrayVector& arrays) : Converter(arrays) {}
+
+ SEXP Allocate(R_xlen_t n) const { return Rcpp::List(no_init(n)); }
+
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ // nothing to do, list contain NULL by default
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n) const {
+ using internal::checked_cast;
+ auto list_array = checked_cast<arrow::ListArray*>(array.get());
+ auto values_array = list_array->values();
+
+ auto ingest_one = [&](R_xlen_t i) {
+ auto slice =
+ values_array->Slice(list_array->value_offset(i), list_array->value_length(i));
+ SET_VECTOR_ELT(data, i + start, Array__as_vector(slice));
+ };
+
+ if (array->null_count()) {
+ internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(),
+ n);
+
+ for (R_xlen_t i = 0; i < n; i++, bitmap_reader.Next()) {
+ if (bitmap_reader.IsSet()) ingest_one(i);
+ }
+
+ } else {
+ for (R_xlen_t i = 0; i < n; i++) {
+ ingest_one(i);
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
class Converter_Int64 : public Converter {
public:
explicit Converter_Int64(const ArrayVector& arrays) : Converter(arrays) {}
@@ -658,9 +699,13 @@ std::shared_ptr<Converter> Converter::Make(const ArrayVector& arrays) {
case Type::DECIMAL:
return std::make_shared<arrow::r::Converter_Decimal>(arrays);
+ // nested
case Type::STRUCT:
return std::make_shared<arrow::r::Converter_Struct>(arrays);
+ case Type::LIST:
+ return std::make_shared<arrow::r::Converter_List>(arrays);
+
default:
break;
}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 8d5ed3e..31e27ea 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -317,6 +317,83 @@ RcppExport SEXP _arrow_StructArray__Flatten(SEXP array_sexp){
}
#endif
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListArray__value_type(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return Rcpp::wrap(ListArray__value_type(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_type(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__value_type(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ListArray__values(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return Rcpp::wrap(ListArray__values(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__values(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__values(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_length(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ Rcpp::traits::input_parameter<int64_t>::type i(i_sexp);
+ return Rcpp::wrap(ListArray__value_length(array, i));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_length(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ListArray__value_length(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+int32_t ListArray__value_offset(const std::shared_ptr<arrow::ListArray>& array, int64_t i);
+RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ Rcpp::traits::input_parameter<int64_t>::type i(i_sexp);
+ return Rcpp::wrap(ListArray__value_offset(array, i));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__value_offset(SEXP array_sexp, SEXP i_sexp){
+ Rf_error("Cannot call ListArray__value_offset(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// array.cpp
+#if defined(ARROW_R_WITH_ARROW)
+Rcpp::IntegerVector ListArray__raw_value_offsets(const std::shared_ptr<arrow::ListArray>& array);
+RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListArray>&>::type array(array_sexp);
+ return Rcpp::wrap(ListArray__raw_value_offsets(array));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListArray__raw_value_offsets(SEXP array_sexp){
+ Rf_error("Cannot call ListArray__raw_value_offsets(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// array__to_vector.cpp
#if defined(ARROW_R_WITH_ARROW)
SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array);
@@ -1743,6 +1820,36 @@ RcppExport SEXP _arrow_StructType__GetFieldIndex(SEXP type_sexp, SEXP name_sexp)
}
#endif
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Field> ListType__value_field(const std::shared_ptr<arrow::ListType>& type);
+RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+ return Rcpp::wrap(ListType__value_field(type));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListType__value_field(SEXP type_sexp){
+ Rf_error("Cannot call ListType__value_field(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
+// datatype.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ListType__value_type(const std::shared_ptr<arrow::ListType>& type);
+RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::ListType>&>::type type(type_sexp);
+ return Rcpp::wrap(ListType__value_type(type));
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_ListType__value_type(SEXP type_sexp){
+ Rf_error("Cannot call ListType__value_type(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// feather.cpp
#if defined(ARROW_R_WITH_ARROW)
void ipc___feather___TableWriter__SetDescription(const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer, const std::string& description);
@@ -3388,6 +3495,11 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_StructArray__field", (DL_FUNC) &_arrow_StructArray__field, 2},
{ "_arrow_StructArray__GetFieldByName", (DL_FUNC) &_arrow_StructArray__GetFieldByName, 2},
{ "_arrow_StructArray__Flatten", (DL_FUNC) &_arrow_StructArray__Flatten, 1},
+ { "_arrow_ListArray__value_type", (DL_FUNC) &_arrow_ListArray__value_type, 1},
+ { "_arrow_ListArray__values", (DL_FUNC) &_arrow_ListArray__values, 1},
+ { "_arrow_ListArray__value_length", (DL_FUNC) &_arrow_ListArray__value_length, 2},
+ { "_arrow_ListArray__value_offset", (DL_FUNC) &_arrow_ListArray__value_offset, 2},
+ { "_arrow_ListArray__raw_value_offsets", (DL_FUNC) &_arrow_ListArray__raw_value_offsets, 1},
{ "_arrow_Array__as_vector", (DL_FUNC) &_arrow_Array__as_vector, 1},
{ "_arrow_ChunkedArray__as_vector", (DL_FUNC) &_arrow_ChunkedArray__as_vector, 1},
{ "_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 2},
@@ -3482,6 +3594,8 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1},
{ "_arrow_StructType__GetFieldByName", (DL_FUNC) &_arrow_StructType__GetFieldByName, 2},
{ "_arrow_StructType__GetFieldIndex", (DL_FUNC) &_arrow_StructType__GetFieldIndex, 2},
+ { "_arrow_ListType__value_field", (DL_FUNC) &_arrow_ListType__value_field, 1},
+ { "_arrow_ListType__value_type", (DL_FUNC) &_arrow_ListType__value_type, 1},
{ "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2},
{ "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2},
{ "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3},
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
index 18920f2..f4a4b09 100644
--- a/r/src/datatype.cpp
+++ b/r/src/datatype.cpp
@@ -281,4 +281,16 @@ int StructType__GetFieldIndex(const std::shared_ptr<arrow::StructType>& type,
return type->GetFieldIndex(name);
}
+// [[arrow::export]]
+std::shared_ptr<arrow::Field> ListType__value_field(
+ const std::shared_ptr<arrow::ListType>& type) {
+ return type->value_field();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ListType__value_type(
+ const std::shared_ptr<arrow::ListType>& type) {
+ return type->value_type();
+}
+
#endif
diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R
index 6f77b3b..dfc0d53 100644
--- a/r/tests/testthat/test-DataType.R
+++ b/r/tests/testthat/test-DataType.R
@@ -297,6 +297,8 @@ test_that("list type works as expected", {
x$children(),
list(field("item", int32()))
)
+ expect_equal(x$value_type, int32())
+ expect_equal(x$value_field, field("item", int32()))
})
test_that("struct type works as expected", {
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index 38b20a8..206dfdd 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -75,12 +75,12 @@ test_that("read_json_arrow() converts to tibble", {
test_that("Can read json file with nested columns (ARROW-5503)", {
tf <- tempfile()
writeLines('
- { "nuf": {} }
- { "nuf": null }
- { "nuf": { "ps": 78.0, "hello": "hi" } }
- { "nuf": { "ps": 90.0, "hello": "bonjour" } }
- { "nuf": { "hello": "ciao" } }
- { "nuf": { "ps": 19 } }
+ { "arr": [1.0, 2.0, 3.0], "nuf": {} }
+ { "arr": [2.0], "nuf": null }
+ { "arr": [], "nuf": { "ps": 78.0, "hello": "hi" } }
+ { "arr": null, "nuf": { "ps": 90.0, "hello": "bonjour" } }
+ { "arr": [5.0], "nuf": { "hello": "ciao" } }
+ { "arr": [5.0, 6.0], "nuf": { "ps": 19 } }
', tf)
tab1 <- read_json_arrow(tf, as_tibble = FALSE)
@@ -93,11 +93,12 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
expect_equal(
tab1$schema,
schema(
+ arr = list_of(float64()),
nuf = struct(ps = float64(), hello = utf8())
)
)
- struct_array <- tab1$column(0)$data()$chunk(0)
+ struct_array <- tab1$column(1)$data()$chunk(0)
ps <- array(c(NA, NA, 78, 90, NA, 19))
hello <- array(c(NA, NA, "hi", "bonjour", "ciao", NA))
expect_equal(struct_array$field(0L), ps)
@@ -108,8 +109,28 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE)
)
- # cannot yet test list and struct types in R api
- # tib <- as.data.frame(tab1)
+ list_array_r <- list(
+ c(1, 2, 3),
+ c(2),
+ numeric(),
+ NULL,
+ 5,
+ c(5, 6)
+ )
+ list_array <- tab1$column(0)$data()
+ expect_identical(
+ list_array$as_vector(),
+ list_array_r
+ )
+
+ tib <- as.data.frame(tab1)
+ expect_identical(
+ tib,
+ tibble::tibble(
+ arr = list_array_r,
+ nuf = data.frame(ps = ps$as_vector(), hello = hello$as_vector(), stringsAsFactors = FALSE)
+ )
+ )
unlink(tf)
})