You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/19 12:37:09 UTC

[GitHub] [arrow] nealrichardson commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

nealrichardson commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672252916



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       I thought arrays were immutable? when/how likely is this to be true?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Should this also assert altrep-ness, maybe something like this?
   
   ```suggestion
       alt <- Array$create(x)$as_vector()
       expect_true(is_altrep(alt))
       result <- fn(alt)
       # Calling fn() didn't cause the vector to lose its altrep-ness
       expect_true(is_altrep(alt))
       expect_identical(result, fn(x, ...), ...))
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       `INT32_MIN` is `NA_integer_` right? So it needs to be out of bounds too. Or does this constant come from R and already have that factored in?
   
   ```suggestion
         if (value <= INT32_MIN || value > INT32_MAX) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       I would have expected that, instead of altering the Arrow data, we would have `_ELT` and `_GET_REGION` methods that would insert the sentinels in the R vector when the data is accessed. What are the tradeoffs of that way vs. this way?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org