You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/28 08:27:29 UTC
[arrow] branch master updated: ARROW-3341: [R] Support for logical
vector
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3c24f7a ARROW-3341: [R] Support for logical vector
3c24f7a is described below
commit 3c24f7a19c9ee330477ce8b15e6756134afb925e
Author: Romain Francois <ro...@purrple.cat>
AuthorDate: Fri Sep 28 04:27:20 2018 -0400
ARROW-3341: [R] Support for logical vector
As opposed to the other arrays, this needs special handling of buffers because R stores logical vectors as `int`.
Author: Romain Francois <ro...@purrple.cat>
Closes #2652 from romainfrancois/feature/ARROW-3341-LGLSXP and squashes the following commits:
78da8935d <Romain Francois> lint
4c4c0a414 <Romain Francois> install clang tools on the R build to run travis_lint
9d4b432af <Romain Francois> logical vector <-> array and chunked array
---
.travis.yml | 4 +-
r/.Rbuildignore | 2 +
r/src/array.cpp | 104 +++++++++++++++++++++++++++++++++--
r/tests/testthat/test-Array.R | 13 +++++
r/tests/testthat/test-RecordBatch.R | 18 ++++--
r/tests/testthat/test-chunkedarray.R | 25 +++++++++
6 files changed, 154 insertions(+), 12 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 10191de..79c6add 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -261,10 +261,10 @@ matrix:
fi
- $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh
- $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library
- - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib
- - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig
- $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh
- $TRAVIS_BUILD_DIR/ci/travis_lint.sh
+ - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib
+ - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig
- pushd ${TRAVIS_BUILD_DIR}/r
diff --git a/r/.Rbuildignore b/r/.Rbuildignore
index 89f34ae..b0c42b5 100644
--- a/r/.Rbuildignore
+++ b/r/.Rbuildignore
@@ -4,3 +4,5 @@
src/.clang-format
LICENSE.md
^data-raw$
+lint.sh
+
diff --git a/r/src/array.cpp b/r/src/array.cpp
index 02c52aa..0de2d80 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -36,7 +36,7 @@ class SimpleRBuffer : public arrow::Buffer {
Vec vec_;
};
-template <int RTYPE, typename Type>
+template <int RTYPE, typename Type, typename ArrayType>
std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
Rcpp::Vector<RTYPE> vec(x);
std::vector<std::shared_ptr<arrow::Buffer>> buffers{
@@ -80,7 +80,69 @@ std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
);
// return the right Array class
- return std::make_shared<arrow::NumericArray<Type>>(data);
+ return std::make_shared<ArrayType>(data);
+}
+
+std::shared_ptr<arrow::Array> MakeBooleanArray(
+ Rcpp::Vector<LGLSXP, Rcpp::NoProtectStorage> vec) {
+ R_xlen_t n = vec.size();
+
+ // allocate a buffer for the data
+ std::shared_ptr<arrow::Buffer> data_bitmap;
+ R_ERROR_NOT_OK(arrow::AllocateBuffer(ceil(n / 8), &data_bitmap));
+ auto data_bitmap_data = data_bitmap->mutable_data();
+ arrow::internal::FirstTimeBitmapWriter bitmap_writer(data_bitmap_data, 0, n);
+ R_xlen_t null_count = 0;
+
+ // loop until the first no null
+ R_xlen_t i = 0;
+ for (; i < n; i++, bitmap_writer.Next()) {
+ if (vec[i] == 0) {
+ bitmap_writer.Clear();
+ } else if (vec[i] == NA_LOGICAL) {
+ break;
+ } else {
+ bitmap_writer.Set();
+ }
+ }
+
+ std::shared_ptr<arrow::Buffer> null_bitmap(nullptr);
+ if (i < n) {
+ // there has been a null before the end, so we need
+ // to collect that information in a null bitmap
+ R_ERROR_NOT_OK(arrow::AllocateBuffer(ceil(n / 8), &null_bitmap));
+ auto null_bitmap_data = null_bitmap->mutable_data();
+ arrow::internal::FirstTimeBitmapWriter null_bitmap_writer(null_bitmap_data, 0, n);
+
+ // catch up on the initial `i` bits
+ for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
+ null_bitmap_writer.Set();
+ }
+
+ // finish both bitmaps
+ for (; i < n; i++, bitmap_writer.Next(), null_bitmap_writer.Next()) {
+ if (vec[i] == 0) {
+ bitmap_writer.Clear();
+ null_bitmap_writer.Set();
+ } else if (vec[i] == NA_LOGICAL) {
+ null_bitmap_writer.Clear();
+ null_count++;
+ } else {
+ bitmap_writer.Set();
+ null_bitmap_writer.Set();
+ }
+ }
+ null_bitmap_writer.Finish();
+ }
+ bitmap_writer.Finish();
+
+ auto data = ArrayData::Make(std::make_shared<BooleanType>(), n,
+ {std::move(null_bitmap), std::move(data_bitmap)},
+ null_count, 0 /*offset*/
+ );
+
+ // return the right Array class
+ return std::make_shared<BooleanArray>(data);
}
} // namespace r
@@ -89,16 +151,21 @@ std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
// [[Rcpp::export]]
std::shared_ptr<arrow::Array> Array__from_vector(SEXP x) {
switch (TYPEOF(x)) {
+ case LGLSXP:
+ return arrow::r::MakeBooleanArray(x);
case INTSXP:
if (Rf_isFactor(x)) {
break;
}
- return arrow::r::SimpleArray<INTSXP, arrow::Int32Type>(x);
+ return arrow::r::SimpleArray<INTSXP, arrow::Int32Type,
+ arrow::NumericArray<arrow::Int32Type>>(x);
case REALSXP:
// TODO: Dates, ...
- return arrow::r::SimpleArray<REALSXP, arrow::DoubleType>(x);
+ return arrow::r::SimpleArray<REALSXP, arrow::DoubleType,
+ arrow::NumericArray<arrow::DoubleType>>(x);
case RAWSXP:
- return arrow::r::SimpleArray<RAWSXP, arrow::Int8Type>(x);
+ return arrow::r::SimpleArray<RAWSXP, arrow::Int8Type,
+ arrow::NumericArray<arrow::Int8Type>>(x);
default:
break;
}
@@ -131,9 +198,36 @@ inline SEXP simple_Array_to_Vector(const std::shared_ptr<arrow::Array>& array) {
return vec;
}
+inline SEXP BooleanArray_to_Vector(const std::shared_ptr<arrow::Array>& array) {
+ size_t n = array->length();
+ LogicalVector vec(n);
+
+ // process the data
+ arrow::internal::BitmapReader data_reader(array->data()->buffers[1]->data(),
+ array->offset(), n);
+ for (size_t i = 0; i < n; i++, data_reader.Next()) {
+ vec[i] = data_reader.IsSet();
+ }
+
+ // then the null bitmap if needed
+ if (array->null_count()) {
+ arrow::internal::BitmapReader null_reader(array->null_bitmap()->data(),
+ array->offset(), n);
+ for (size_t i = 0; i < n; i++, null_reader.Next()) {
+ if (null_reader.IsNotSet()) {
+ vec[i] = LogicalVector::get_na();
+ }
+ }
+ }
+
+ return vec;
+}
+
// [[Rcpp::export]]
SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array) {
switch (array->type_id()) {
+ case Type::BOOL:
+ return BooleanArray_to_Vector(array);
case Type::INT8:
return simple_Array_to_Vector<RAWSXP>(array);
case Type::INT32:
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index f747cae..78b641a 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -64,3 +64,16 @@ test_that("Array supports NA", {
expect_equal(Array__Mask(x_int), c(rep(TRUE, 10), FALSE))
expect_equal(Array__Mask(x_dbl), c(rep(TRUE, 10), FALSE))
})
+
+test_that("Array supports logical vectors (ARROW-3341)", {
+ # with NA
+ x <- sample(c(TRUE, FALSE, NA), 1000, replace = TRUE)
+ arr_lgl <- array(x)
+ expect_identical(x, arr_lgl$as_vector())
+
+ # without NA
+ x <- sample(c(TRUE, FALSE), 1000, replace = TRUE)
+ arr_lgl <- array(x)
+ expect_identical(x, arr_lgl$as_vector())
+})
+
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index 71d645c..67c010c 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -18,19 +18,22 @@
context("arrow::RecordBatch")
test_that("RecordBatch", {
- tbl <- tibble::tibble(int = 1:10, dbl = as.numeric(1:10))
+ tbl <- tibble::tibble(
+ int = 1:10, dbl = as.numeric(1:10),
+ lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE)
+ )
batch <- record_batch(tbl)
expect_true(batch == batch)
expect_equal(
batch$schema(),
- schema(int = int32(), dbl = float64())
+ schema(int = int32(), dbl = float64(), lgl = boolean())
)
- expect_equal(batch$num_columns(), 2L)
+ expect_equal(batch$num_columns(), 3L)
expect_equal(batch$num_rows(), 10L)
expect_equal(batch$column_name(0), "int")
expect_equal(batch$column_name(1), "dbl")
- expect_equal(names(batch), c("int", "dbl"))
+ expect_equal(names(batch), c("int", "dbl", "lgl"))
col_int <- batch$column(0)
expect_true(inherits(col_int, 'arrow::Array'))
@@ -42,10 +45,15 @@ test_that("RecordBatch", {
expect_equal(col_dbl$as_vector(), tbl$dbl)
expect_equal(col_dbl$type(), float64())
+ col_lgl <- batch$column(2)
+ expect_true(inherits(col_dbl, 'arrow::Array'))
+ expect_equal(col_lgl$as_vector(), tbl$lgl)
+ expect_equal(col_lgl$type(), boolean())
+
batch2 <- batch$RemoveColumn(0)
expect_equal(
batch2$schema(),
- schema(dbl = float64())
+ schema(dbl = float64(), lgl = boolean())
)
expect_equal(batch2$column(0), batch$column(1))
diff --git a/r/tests/testthat/test-chunkedarray.R b/r/tests/testthat/test-chunkedarray.R
index 09b062f..0addfb2 100644
--- a/r/tests/testthat/test-chunkedarray.R
+++ b/r/tests/testthat/test-chunkedarray.R
@@ -75,3 +75,28 @@ test_that("ChunkedArray handles NA", {
expect_equal(Array__Mask(chunks[[2]]), !is.na(data[[2]]))
expect_equal(Array__Mask(chunks[[3]]), !is.na(data[[3]]))
})
+
+test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
+ # with NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
+ arr_lgl <- chunked_array(!!!data)
+ expect_equal(arr_lgl$length(), 300L)
+ expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na))))
+
+ chunks <- arr_lgl$chunks()
+ expect_identical(data[[1]], chunks[[1]]$as_vector())
+ expect_identical(data[[2]], chunks[[2]]$as_vector())
+ expect_identical(data[[3]], chunks[[3]]$as_vector())
+
+ # without NA
+ data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
+ arr_lgl <- chunked_array(!!!data)
+ expect_equal(arr_lgl$length(), 300L)
+ expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na))))
+
+ chunks <- arr_lgl$chunks()
+ expect_identical(data[[1]], chunks[[1]]$as_vector())
+ expect_identical(data[[2]], chunks[[2]]$as_vector())
+ expect_identical(data[[3]], chunks[[3]]$as_vector())
+})
+