You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/28 08:27:29 UTC

[arrow] branch master updated: ARROW-3341: [R] Support for logical vector

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c24f7a  ARROW-3341: [R] Support for logical vector
3c24f7a is described below

commit 3c24f7a19c9ee330477ce8b15e6756134afb925e
Author: Romain Francois <ro...@purrple.cat>
AuthorDate: Fri Sep 28 04:27:20 2018 -0400

    ARROW-3341: [R] Support for logical vector
    
    As opposed to the other arrays, this needs special handling of buffers because R stores logical vectors as `int`.
    
    Author: Romain Francois <ro...@purrple.cat>
    
    Closes #2652 from romainfrancois/feature/ARROW-3341-LGLSXP and squashes the following commits:
    
    78da8935d <Romain Francois> lint
    4c4c0a414 <Romain Francois> install clang tools on the R build to run travis_lint
    9d4b432af <Romain Francois> logical vector <-> array and chunked array
---
 .travis.yml                          |   4 +-
 r/.Rbuildignore                      |   2 +
 r/src/array.cpp                      | 104 +++++++++++++++++++++++++++++++++--
 r/tests/testthat/test-Array.R        |  13 +++++
 r/tests/testthat/test-RecordBatch.R  |  18 ++++--
 r/tests/testthat/test-chunkedarray.R |  25 +++++++++
 6 files changed, 154 insertions(+), 12 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 10191de..79c6add 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -261,10 +261,10 @@ matrix:
         fi
     - $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh
     - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh --only-library
-    - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib
-    - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig
     - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh
     - $TRAVIS_BUILD_DIR/ci/travis_lint.sh
+    - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib
+    - export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$TRAVIS_BUILD_DIR/cpp-install/lib/pkgconfig
     - pushd ${TRAVIS_BUILD_DIR}/r
 
 
diff --git a/r/.Rbuildignore b/r/.Rbuildignore
index 89f34ae..b0c42b5 100644
--- a/r/.Rbuildignore
+++ b/r/.Rbuildignore
@@ -4,3 +4,5 @@
 src/.clang-format
 LICENSE.md
 ^data-raw$
+lint.sh
+
diff --git a/r/src/array.cpp b/r/src/array.cpp
index 02c52aa..0de2d80 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -36,7 +36,7 @@ class SimpleRBuffer : public arrow::Buffer {
   Vec vec_;
 };
 
-template <int RTYPE, typename Type>
+template <int RTYPE, typename Type, typename ArrayType>
 std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
   Rcpp::Vector<RTYPE> vec(x);
   std::vector<std::shared_ptr<arrow::Buffer>> buffers{
@@ -80,7 +80,69 @@ std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
   );
 
   // return the right Array class
-  return std::make_shared<arrow::NumericArray<Type>>(data);
+  return std::make_shared<ArrayType>(data);
+}
+
+std::shared_ptr<arrow::Array> MakeBooleanArray(
+    Rcpp::Vector<LGLSXP, Rcpp::NoProtectStorage> vec) {
+  R_xlen_t n = vec.size();
+
+  // allocate a buffer for the data
+  std::shared_ptr<arrow::Buffer> data_bitmap;
+  R_ERROR_NOT_OK(arrow::AllocateBuffer(ceil(n / 8), &data_bitmap));
+  auto data_bitmap_data = data_bitmap->mutable_data();
+  arrow::internal::FirstTimeBitmapWriter bitmap_writer(data_bitmap_data, 0, n);
+  R_xlen_t null_count = 0;
+
+  // loop until the first no null
+  R_xlen_t i = 0;
+  for (; i < n; i++, bitmap_writer.Next()) {
+    if (vec[i] == 0) {
+      bitmap_writer.Clear();
+    } else if (vec[i] == NA_LOGICAL) {
+      break;
+    } else {
+      bitmap_writer.Set();
+    }
+  }
+
+  std::shared_ptr<arrow::Buffer> null_bitmap(nullptr);
+  if (i < n) {
+    // there has been a null before the end, so we need
+    // to collect that information in a null bitmap
+    R_ERROR_NOT_OK(arrow::AllocateBuffer(ceil(n / 8), &null_bitmap));
+    auto null_bitmap_data = null_bitmap->mutable_data();
+    arrow::internal::FirstTimeBitmapWriter null_bitmap_writer(null_bitmap_data, 0, n);
+
+    // catch up on the initial `i` bits
+    for (R_xlen_t j = 0; j < i; j++, null_bitmap_writer.Next()) {
+      null_bitmap_writer.Set();
+    }
+
+    // finish both bitmaps
+    for (; i < n; i++, bitmap_writer.Next(), null_bitmap_writer.Next()) {
+      if (vec[i] == 0) {
+        bitmap_writer.Clear();
+        null_bitmap_writer.Set();
+      } else if (vec[i] == NA_LOGICAL) {
+        null_bitmap_writer.Clear();
+        null_count++;
+      } else {
+        bitmap_writer.Set();
+        null_bitmap_writer.Set();
+      }
+    }
+    null_bitmap_writer.Finish();
+  }
+  bitmap_writer.Finish();
+
+  auto data = ArrayData::Make(std::make_shared<BooleanType>(), n,
+                              {std::move(null_bitmap), std::move(data_bitmap)},
+                              null_count, 0 /*offset*/
+  );
+
+  // return the right Array class
+  return std::make_shared<BooleanArray>(data);
 }
 
 }  // namespace r
@@ -89,16 +151,21 @@ std::shared_ptr<arrow::Array> SimpleArray(SEXP x) {
 // [[Rcpp::export]]
 std::shared_ptr<arrow::Array> Array__from_vector(SEXP x) {
   switch (TYPEOF(x)) {
+    case LGLSXP:
+      return arrow::r::MakeBooleanArray(x);
     case INTSXP:
       if (Rf_isFactor(x)) {
         break;
       }
-      return arrow::r::SimpleArray<INTSXP, arrow::Int32Type>(x);
+      return arrow::r::SimpleArray<INTSXP, arrow::Int32Type,
+                                   arrow::NumericArray<arrow::Int32Type>>(x);
     case REALSXP:
       // TODO: Dates, ...
-      return arrow::r::SimpleArray<REALSXP, arrow::DoubleType>(x);
+      return arrow::r::SimpleArray<REALSXP, arrow::DoubleType,
+                                   arrow::NumericArray<arrow::DoubleType>>(x);
     case RAWSXP:
-      return arrow::r::SimpleArray<RAWSXP, arrow::Int8Type>(x);
+      return arrow::r::SimpleArray<RAWSXP, arrow::Int8Type,
+                                   arrow::NumericArray<arrow::Int8Type>>(x);
     default:
       break;
   }
@@ -131,9 +198,36 @@ inline SEXP simple_Array_to_Vector(const std::shared_ptr<arrow::Array>& array) {
   return vec;
 }
 
+inline SEXP BooleanArray_to_Vector(const std::shared_ptr<arrow::Array>& array) {
+  size_t n = array->length();
+  LogicalVector vec(n);
+
+  // process the data
+  arrow::internal::BitmapReader data_reader(array->data()->buffers[1]->data(),
+                                            array->offset(), n);
+  for (size_t i = 0; i < n; i++, data_reader.Next()) {
+    vec[i] = data_reader.IsSet();
+  }
+
+  // then the null bitmap if needed
+  if (array->null_count()) {
+    arrow::internal::BitmapReader null_reader(array->null_bitmap()->data(),
+                                              array->offset(), n);
+    for (size_t i = 0; i < n; i++, null_reader.Next()) {
+      if (null_reader.IsNotSet()) {
+        vec[i] = LogicalVector::get_na();
+      }
+    }
+  }
+
+  return vec;
+}
+
 // [[Rcpp::export]]
 SEXP Array__as_vector(const std::shared_ptr<arrow::Array>& array) {
   switch (array->type_id()) {
+    case Type::BOOL:
+      return BooleanArray_to_Vector(array);
     case Type::INT8:
       return simple_Array_to_Vector<RAWSXP>(array);
     case Type::INT32:
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index f747cae..78b641a 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -64,3 +64,16 @@ test_that("Array supports NA", {
   expect_equal(Array__Mask(x_int), c(rep(TRUE, 10), FALSE))
   expect_equal(Array__Mask(x_dbl), c(rep(TRUE, 10), FALSE))
 })
+
+test_that("Array supports logical vectors (ARROW-3341)", {
+  # with NA
+  x <- sample(c(TRUE, FALSE, NA), 1000, replace = TRUE)
+  arr_lgl <- array(x)
+  expect_identical(x, arr_lgl$as_vector())
+
+  # without NA
+  x <- sample(c(TRUE, FALSE), 1000, replace = TRUE)
+  arr_lgl <- array(x)
+  expect_identical(x, arr_lgl$as_vector())
+})
+
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index 71d645c..67c010c 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -18,19 +18,22 @@
 context("arrow::RecordBatch")
 
 test_that("RecordBatch", {
-  tbl <- tibble::tibble(int = 1:10, dbl = as.numeric(1:10))
+  tbl <- tibble::tibble(
+    int = 1:10, dbl = as.numeric(1:10),
+    lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE)
+  )
   batch <- record_batch(tbl)
 
   expect_true(batch == batch)
   expect_equal(
     batch$schema(),
-    schema(int = int32(), dbl = float64())
+    schema(int = int32(), dbl = float64(), lgl = boolean())
   )
-  expect_equal(batch$num_columns(), 2L)
+  expect_equal(batch$num_columns(), 3L)
   expect_equal(batch$num_rows(), 10L)
   expect_equal(batch$column_name(0), "int")
   expect_equal(batch$column_name(1), "dbl")
-  expect_equal(names(batch), c("int", "dbl"))
+  expect_equal(names(batch), c("int", "dbl", "lgl"))
 
   col_int <- batch$column(0)
   expect_true(inherits(col_int, 'arrow::Array'))
@@ -42,10 +45,15 @@ test_that("RecordBatch", {
   expect_equal(col_dbl$as_vector(), tbl$dbl)
   expect_equal(col_dbl$type(), float64())
 
+  col_lgl <- batch$column(2)
+  expect_true(inherits(col_dbl, 'arrow::Array'))
+  expect_equal(col_lgl$as_vector(), tbl$lgl)
+  expect_equal(col_lgl$type(), boolean())
+
   batch2 <- batch$RemoveColumn(0)
   expect_equal(
     batch2$schema(),
-    schema(dbl = float64())
+    schema(dbl = float64(), lgl = boolean())
   )
   expect_equal(batch2$column(0), batch$column(1))
 
diff --git a/r/tests/testthat/test-chunkedarray.R b/r/tests/testthat/test-chunkedarray.R
index 09b062f..0addfb2 100644
--- a/r/tests/testthat/test-chunkedarray.R
+++ b/r/tests/testthat/test-chunkedarray.R
@@ -75,3 +75,28 @@ test_that("ChunkedArray handles NA", {
   expect_equal(Array__Mask(chunks[[2]]), !is.na(data[[2]]))
   expect_equal(Array__Mask(chunks[[3]]), !is.na(data[[3]]))
 })
+
+test_that("ChunkedArray supports logical vectors (ARROW-3341)", {
+  # with NA
+  data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE))
+  arr_lgl <- chunked_array(!!!data)
+  expect_equal(arr_lgl$length(), 300L)
+  expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na))))
+
+  chunks <- arr_lgl$chunks()
+  expect_identical(data[[1]], chunks[[1]]$as_vector())
+  expect_identical(data[[2]], chunks[[2]]$as_vector())
+  expect_identical(data[[3]], chunks[[3]]$as_vector())
+
+  # without NA
+  data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE))
+  arr_lgl <- chunked_array(!!!data)
+  expect_equal(arr_lgl$length(), 300L)
+  expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na))))
+
+  chunks <- arr_lgl$chunks()
+  expect_identical(data[[1]], chunks[[1]]$as_vector())
+  expect_identical(data[[2]], chunks[[2]]$as_vector())
+  expect_identical(data[[3]], chunks[[3]]$as_vector())
+})
+