You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/21 16:55:17 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

pitrou commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r674161781



##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,121 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+expect_altrep_rountrip <- function(x, fn, ...) {
+  alt <- Array$create(x)$as_vector()
+
+  expect_true(is_altrep(alt))
+  expect_identical(fn(x, ...), fn(alt, ...))
+  expect_true(is_altrep(alt))
+}
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  x <- c(1, 2, 3)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1, 2, NA_real_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_real_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))

Review comment:
       Is there a way to test the precise warning that is being emitted here? (or at least a substring thereof)

##########
File path: r/src/altrep.cpp
##########
@@ -19,9 +19,14 @@
 
 #if defined(ARROW_R_WITH_ARROW)
 
+#include <arrow/compute/api.h>

Review comment:
       Would be nice if you could cluster all Arrow includes together.

##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       Hmm, yes, I guess it _should_ be logicall inconsequential. OTOH, it may come as a surprise if people e.g. blindlessly memory-map a file in read-write mode without imagining that RArrow will modify it.

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> Options(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
 
   static void Init(R_altrep_class_t class_t, DllInfo* dll) {
     // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    R_set_altrep_Length_method(class_t, AltrepVector::Length);
+    R_set_altrep_Inspect_method(class_t, AltrepVector::Inspect);
+    R_set_altrep_Duplicate_method(class_t, AltrepVector::Duplicate);
 
     // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    R_set_altvec_Dataptr_method(class_t, AltrepVector::Dataptr);
+    R_set_altvec_Dataptr_or_null_method(class_t, AltrepVector::Dataptr_or_null);
   }
 };
 
-struct DoubleArrayNoNull {
+struct AltrepVectorDouble {
+  using Base = AltrepVector<REALSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    class_t = R_make_altreal_class("array_dbl_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+
+    R_set_altreal_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altreal_Sum_method(class_t, Base::Sum);
+    R_set_altreal_Min_method(class_t, Base::Min);
+    R_set_altreal_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-struct Int32ArrayNoNull {
+struct AltrepVectorInt32 {
+  using Base = AltrepVector<INTSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+    class_t = R_make_altinteger_class("array_int_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+    R_set_altinteger_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altinteger_Sum_method(class_t, Base::Sum);
+    R_set_altinteger_Min_method(class_t, Base::Min);
+    R_set_altinteger_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-R_altrep_class_t Int32ArrayNoNull::class_t;
-R_altrep_class_t DoubleArrayNoNull::class_t;
+R_altrep_class_t AltrepVectorInt32::class_t;
+R_altrep_class_t AltrepVectorDouble::class_t;
 
 void Init_Altrep_classes(DllInfo* dll) {
-  DoubleArrayNoNull::Init(dll);
-  Int32ArrayNoNull::Init(dll);
+  AltrepVectorDouble::Init(dll);
+  AltrepVectorInt32::Init(dll);
 }
 
-SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array) {
-  return DoubleArrayNoNull::Make(array);
+SEXP MakeAltrepVectorDouble(const std::shared_ptr<Array>& array, RTasks& tasks) {

Review comment:
       Since `tasks` is a mutable argument, can you make it `RTasks* tasks`?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,121 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+expect_altrep_rountrip <- function(x, fn, ...) {
+  alt <- Array$create(x)$as_vector()
+
+  expect_true(is_altrep(alt))
+  expect_identical(fn(x, ...), fn(alt, ...))
+  expect_true(is_altrep(alt))
+}
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  x <- c(1, 2, 3)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1, 2, NA_real_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_real_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep min/max/sum identical to R versions for int", {
+  x <- c(1L, 2L, 3L)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1L, 2L, NA_integer_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_integer_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))

Review comment:
       Same here.

##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&

Review comment:
       Why the `use_count` check? Even if the Buffer _object_ is unshared, the underlying _data_ may be shared (for example because a slice was made of a parent Buffer).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org