You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/16 08:52:11 UTC

[GitHub] [arrow] romainfrancois opened a new pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

romainfrancois opened a new pull request #10730:
URL: https://github.com/apache/arrow/pull/10730


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r671149781



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;

Review comment:
       ```suggestion
     using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
   ```
   (Speculative support for logical)

##########
File path: r/src/array_to_vector.cpp
##########
@@ -69,13 +69,25 @@ class Converter {
     // special case when there is only one array
     if (chunked_array_->num_chunks() == 1) {
       const auto& array = chunked_array_->chunk(0);
-      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
-          array->null_count() == 0) {
+      // using altrep if
+      // - the arrow.use_altrep is set to TRUE or unset
+      // - the array has at least one element
+      // - either it has no nulls or the data is mutable
+      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
         switch (array->type()->id()) {
           case arrow::Type::DOUBLE:
-            return arrow::r::MakeDoubleArrayNoNull(array);
+            if (array->null_count() == 0 || array->data()->buffers[1]->is_mutable()) {

Review comment:
       It's not safe to do this if anyone else might mutate this buffer. We don't have a completely safe way to check uniqueness, either. As a stop gap:
   ```suggestion
               if (array->null_count() == 0 || array->data().use_count() == 1 &&
                   array->data()->buffers[1].use_count() == 1 && array->data()->buffers[1]->is_mutable()) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
+  // altrep object around an Array
   // data1: an external pointer to a shared pointer to the Array
   // data2: not used
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array,
+                   RTasks& tasks) {
     // we don't need the whole r6 object, just an external pointer
-    // that retain the array
+    // that retain the Array
     Pointer xp(new std::shared_ptr<Array>(array));
 
+    // we only get here if the Array data buffer is mutable
+    // UseSentinel() puts the R sentinel where the data is null
+    auto null_count = array->null_count();
+    if (null_count > 0) {
+      tasks.Append(true, [array]() {
+        UseSentinel<data_type>(array);

Review comment:
       In the case of a few long arrays, it'd be more performant to divide this work into L2 sized chunks

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
+  // altrep object around an Array
   // data1: an external pointer to a shared pointer to the Array
   // data2: not used
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array,
+                   RTasks& tasks) {
     // we don't need the whole r6 object, just an external pointer
-    // that retain the array
+    // that retain the Array
     Pointer xp(new std::shared_ptr<Array>(array));
 
+    // we only get here if the Array data buffer is mutable
+    // UseSentinel() puts the R sentinel where the data is null
+    auto null_count = array->null_count();
+    if (null_count > 0) {

Review comment:
       Would it make sense to do this lazily in dataptr/getregion?

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +148,144 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
 
   static void Init(R_altrep_class_t class_t, DllInfo* dll) {
     // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    R_set_altrep_Length_method(class_t, AltrepVector::Length);
+    R_set_altrep_Inspect_method(class_t, AltrepVector::Inspect);
+    R_set_altrep_Duplicate_method(class_t, AltrepVector::Duplicate);
 
     // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    R_set_altvec_Dataptr_method(class_t, AltrepVector::Dataptr);
+    R_set_altvec_Dataptr_or_null_method(class_t, AltrepVector::Dataptr_or_null);
   }
 };
 
-struct DoubleArrayNoNull {
+struct AltrepVectorDouble {
+  using Base = AltrepVector<REALSXP>;
   static R_altrep_class_t class_t;
 
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarReal(NA_REAL);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the sum of `array`
+    return NULL;
+  }
+
+  static SEXP Min(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarReal(NA_REAL);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the min of `array`
+    return NULL;
+  }
+
+  static SEXP Max(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarReal(NA_REAL);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the max of `array`
+    return NULL;
+  }
+
   static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    class_t = R_make_altreal_class("array_dbl_vector", "arrow", dll);
+    AltrepVector<REALSXP>::Init(class_t, dll);
+
+    R_set_altreal_No_NA_method(class_t, AltrepVector<REALSXP>::No_NA);
+
+    R_set_altreal_Sum_method(class_t, Sum);
+    R_set_altreal_Min_method(class_t, Min);
+    R_set_altreal_Max_method(class_t, Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return AltrepVector<REALSXP>::Make(class_t, array, tasks);
   }
 };
 
-struct Int32ArrayNoNull {
+struct AltrepVectorInt32 {
+  using Base = AltrepVector<INTSXP>;
   static R_altrep_class_t class_t;
 
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarInteger(NA_INTEGER);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the sum of `array`
+    return NULL;
+  }
+
+  static SEXP Min(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarInteger(NA_INTEGER);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the min of `array`
+    return NULL;
+  }
+
+  static SEXP Max(SEXP x, Rboolean narm) {
+    const auto& array = Base::Get(x);
+
+    if (narm == FALSE && array->null_count()) {
+      return Rf_ScalarInteger(NA_INTEGER);
+    }
+
+    // TODO: instead of returning NULL, use arrow::compute() something
+    //      to calculate the max of `array`
+    return NULL;
+  }
   static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+    class_t = R_make_altinteger_class("array_int_vector", "arrow", dll);
+    AltrepVector<INTSXP>::Init(class_t, dll);

Review comment:
       Please remove Base or use it consistently

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();

Review comment:
       Why can't we use `cpp11::na`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-900161009


   Thanks for the reviews. I'll apply the relevant changes. 
   
   We now have `AltrepArrayNoNulls<INTSXP|REALSXP>` and `AltrepArrayWithNulls<INTSXP|REALSXP>` which are not all that different. I believe they can be merged into one template, which would perhaps slightly more involved `Dataptr()` ... methods... 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-901905934


   I finally settled for a single altrep (template) class for both cases (the array has no nulls or the array has nulls). 
   
   The altrep object uses: 
    - data1: an external pointer to the Array (well to a shared pointer to the Array) 
    - data2: NULL until materialization is needed, then data2 is a standard (and immutable) R vector with the same data
   
   Materialization is needed when: 
    - we need the DATAPTR and the Array has nulls
    - we need a duplicate
   
   Get_region() does not need to materialize the full vector.
   Creating the altrep object does not alter the data to impose the R sentinel. 
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r699105566



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       In a way yes, but the user should not want that because the object is marked as immutable (in the R sense). Unfortunately, this is more of a guideline than something that is enforced by R. 
   
   Accessing data read-only is somewhat new in R, and lots of internal R code would still call `DATAPTR()` even though a read)-only equivalent would be sufficient. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] bkietz commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
bkietz commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r671401636



##########
File path: r/src/array_to_vector.cpp
##########
@@ -69,13 +69,25 @@ class Converter {
     // special case when there is only one array
     if (chunked_array_->num_chunks() == 1) {
       const auto& array = chunked_array_->chunk(0);
-      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
-          array->null_count() == 0) {
+      // using altrep if
+      // - the arrow.use_altrep is set to TRUE or unset
+      // - the array has at least one element
+      // - either it has no nulls or the data is mutable
+      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
         switch (array->type()->id()) {
           case arrow::Type::DOUBLE:
-            return arrow::r::MakeDoubleArrayNoNull(array);
+            if (array->null_count() == 0 || array->data()->buffers[1]->is_mutable()) {

Review comment:
       @pitrou 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r690217725



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&

Review comment:
       This is no longer being used. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-881286881


   follow up after https://github.com/apache/arrow/pull/10593


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672305324



##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Thanks. I'll update to add more things to `expect_altrep_rountrip()`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r671333104



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
+  // altrep object around an Array
   // data1: an external pointer to a shared pointer to the Array
   // data2: not used
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array,
+                   RTasks& tasks) {
     // we don't need the whole r6 object, just an external pointer
-    // that retain the array
+    // that retain the Array
     Pointer xp(new std::shared_ptr<Array>(array));
 
+    // we only get here if the Array data buffer is mutable
+    // UseSentinel() puts the R sentinel where the data is null
+    auto null_count = array->null_count();
+    if (null_count > 0) {
+      tasks.Append(true, [array]() {
+        UseSentinel<data_type>(array);

Review comment:
       Not sure how to do this. Maybe this can be a follow up issue. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jonkeane commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
jonkeane commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909688448


   Yeah, we should run these manually. I'll try to get to that tomorrow. None of the array conversion benchmarks have been wired up in the benchmarks package, so we don't have them available on conbench right now.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672174111



##########
File path: r/src/array_to_vector.cpp
##########
@@ -69,13 +69,25 @@ class Converter {
     // special case when there is only one array
     if (chunked_array_->num_chunks() == 1) {
       const auto& array = chunked_array_->chunk(0);
-      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
-          array->null_count() == 0) {
+      // using altrep if
+      // - the arrow.use_altrep is set to TRUE or unset
+      // - the array has at least one element
+      // - either it has no nulls or the data is mutable
+      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
         switch (array->type()->id()) {
           case arrow::Type::DOUBLE:
-            return arrow::r::MakeDoubleArrayNoNull(array);
+            if (array->null_count() == 0 || array->data()->buffers[1]->is_mutable()) {

Review comment:
       One consequence is that we can't use altrep when taking slices of an Array as the buffer is at least held by two Arrays then. 
   
   ``` r
   library(arrow, warn.conflicts = FALSE)
   #> See arrow_info() for available features
   
   is_altrep <- arrow:::is_altrep
   
   v_int <- Array$create(c(1L, NA, 3L))
   
   is_altrep(v_int$as_vector())
   #> [1] TRUE
   
   is_altrep(as.vector(v_int$Slice(1)))
   #> [1] FALSE
   ```
   
   <sup>Created on 2021-07-19 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>
   
   I commented out the relevant tests in `test-altrep.R`. Does this matter given that the data there is irrelevant because it's identified as null ? 
   
   

##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       specifically here when mutating does not logically change anything. it's my understanding that what's there is undefined and that it is inconsequential to change it. 
   
   it may be false however when the buffer comes from a source that prevents mutation, I don't have a specific example but I guess reading from disk or something. 

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Thanks. I'll update to add more things to `expect_altrep_rountrip()`

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       Thanks. R indeed converts to numeric in that case: 
   
   ``` r
   str(sum(as.integer(c(-2^31 + 1L))))
   #>  int -2147483647
   str(sum(as.integer(c(-2^31 + 1L, -1L))))
   #>  num -2.15e+09
   ```
   
   <sup>Created on 2021-07-19 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Oh interesting. I guess the advantage of this approach is that, for the price of this initial setting of the sentinels, we can then have a `DATAPTR()` so access all the data of the array randomly. 
   
   OTOH, we could leave the data untouched and have an `_Elt` that would check the null bitmap, and an `GetRegion()` that would also locally use the null bitmap. This `GetRegion()` would indeed copy the region data though, so potentially be more expensive, even when the region has no nulls. 

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Chances are we might need to implement something like this for arrays that are backed by an immutable buffer, so maybe I can do that, and then we can compare/measure and evaluate how risky this is. 
   
   Then we have 3 cases: 
    - Array with no nulls: best altrep with DATAPTR(), no need for setting sentinels (what we had prior to this pull request). 
    - Array with some nulls, immutable: using GetRegion, I guess with a materialized R vector in the data2 altrep slot, so that DATAPTR() materializes and returns this. 
    
    - Array with some nulls, "mutable": hacky proposal of this pr. 
   
   Perhaps we only need the first 2. 
   
   
   
   
   
   

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Also, I believe the second dot will have some code shared with altrep for ChunkedArray. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909604178


   @ursabot please benchmark lang=R


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r671325520



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();

Review comment:
       We can :-)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] ursabot commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909604811


   Benchmark runs are scheduled for baseline = f60bc1267aa286713d55fc89137b4afcae3e9d0a and contender = 763781c97ef12054dc710bc9d78965cdd300a66a. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Skipped :warning: Provided benchmark filters do not have any benchmark groups to be executed on ec2-t3-xlarge-us-east-2] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/d93c570cecb14e40aba6b49ad471f722...c1fe26cb25934a36afda3e1313778ab3/)
   [Scheduled] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/0d68e720cbea4e8ab4c704223d2aa056...31c5301553bb49ec92428f1f83bfc6f5/)
   [Skipped :warning: Only ['C++', 'Java'] langs are supported on ursa-thinkcentre-m75q] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/12dd2936d12a4e52b797b92f4b487518...b1e1451732f1491fbb6f6f46ac117fee/)
   Supported benchmarks:
   ursa-i9-9960x: langs = Python, R, JavaScript
   ursa-thinkcentre-m75q: langs = C++, Java
   ec2-t3-xlarge-us-east-2: cloud = True
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-907196505


   > maybe the data has been written into
   
   We can write tests for this behavior, yeah?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-911660345


   > * multi-chunk arrays aren't (yet) supported
   
   That's ARROW-13111. ARROW-13112 is for supporting other types. If I understand correctly, we're conceptually close on those since we're implementing the get_region and elt methods for integer and float here, just ("just"!) need to [something about templates] and do the same for other types and for ChunkedArrays using our existing conversion code.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-881286794


   https://issues.apache.org/jira/browse/ARROW-13164


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r699105566



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       In a way yes, but the user should not want that because the object is marked as immutable (in the R sense). Unfortunately, this is more of a guideline than something that is enforced by R. 
   
   Accessing data read-only is somewhat new in R, and lots of internal R code would still call `DATAPTR()` even though a read)-only equivalent would be sufficient. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jonkeane commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
jonkeane commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-910566198


   And here's the report:
   
   [altvec-with-nulls.html.zip](https://github.com/apache/arrow/files/7093577/altvec-with-nulls.html.zip)
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672304332



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       specifically here when mutating does not logically change anything. it's my understanding that what's there is undefined and that it is inconsequential to change it. 
   
   it may be false however when the buffer comes from a source that prevents mutation, I don't have a specific example but I guess reading from disk or something. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200812


   Revision: 763781c97ef12054dc710bc9d78965cdd300a66a
   
   Submitted crossbow builds: [ursacomputing/crossbow @ actions-806](https://github.com/ursacomputing/crossbow/branches/all?query=actions-806)
   
   |Task|Status|
   |----|------|
   |conda-linux-gcc-py36-cpu-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)|
   |conda-linux-gcc-py37-cpu-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)|
   |conda-osx-clang-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py36-r40)|
   |conda-osx-clang-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py37-r41)|
   |conda-win-vs2017-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py36-r40)|
   |conda-win-vs2017-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py37-r41)|
   |homebrew-r-autobrew|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-homebrew-r-autobrew)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-homebrew-r-autobrew)|
   |test-r-depsource-auto|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-depsource-auto)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-depsource-auto)|
   |test-r-depsource-system|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-depsource-system)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-depsource-system)|
   |test-r-devdocs|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-devdocs)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-devdocs)|
   |test-r-gcc-11|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-gcc-11)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-gcc-11)|
   |test-r-install-local|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-install-local)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-install-local)|
   |test-r-linux-as-cran|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-as-cran)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-as-cran)|
   |test-r-linux-rchk|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-rchk)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-rchk)|
   |test-r-linux-valgrind|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-linux-valgrind)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-linux-valgrind)|
   |test-r-minimal-build|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-minimal-build)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-minimal-build)|
   |test-r-rhub-debian-gcc-devel-lto-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)|
   |test-r-rhub-ubuntu-gcc-release-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)|
   |test-r-rocker-r-base-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rocker-r-base-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rocker-r-base-latest)|
   |test-r-rstudio-r-base-3.6-bionic|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)|
   |test-r-rstudio-r-base-3.6-centos7-devtoolset-8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)|
   |test-r-rstudio-r-base-3.6-centos8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)|
   |test-r-rstudio-r-base-3.6-opensuse15|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)|
   |test-r-rstudio-r-base-3.6-opensuse42|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)|
   |test-r-ubuntu-21.04|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-ubuntu-21.04)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-ubuntu-21.04)|
   |test-r-version-compatibility|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-version-compatibility)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-version-compatibility)|
   |test-r-versions|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-versions)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-versions)|
   |test-ubuntu-18.04-r-sanitizer|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)|


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200812


   Revision: 763781c97ef12054dc710bc9d78965cdd300a66a
   
   Submitted crossbow builds: [ursacomputing/crossbow @ actions-806](https://github.com/ursacomputing/crossbow/branches/all?query=actions-806)
   
   |Task|Status|
   |----|------|
   |conda-linux-gcc-py36-cpu-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)|
   |conda-linux-gcc-py37-cpu-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)|
   |conda-osx-clang-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py36-r40)|
   |conda-osx-clang-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py37-r41)|
   |conda-win-vs2017-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py36-r40)|
   |conda-win-vs2017-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py37-r41)|
   |homebrew-r-autobrew|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-homebrew-r-autobrew)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-homebrew-r-autobrew)|
   |test-r-depsource-auto|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-depsource-auto)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-depsource-auto)|
   |test-r-depsource-system|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-depsource-system)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-depsource-system)|
   |test-r-devdocs|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-devdocs)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-devdocs)|
   |test-r-gcc-11|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-gcc-11)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-gcc-11)|
   |test-r-install-local|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-install-local)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-install-local)|
   |test-r-linux-as-cran|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-as-cran)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-as-cran)|
   |test-r-linux-rchk|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-rchk)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-rchk)|
   |test-r-linux-valgrind|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-linux-valgrind)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-linux-valgrind)|
   |test-r-minimal-build|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-minimal-build)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-minimal-build)|
   |test-r-rhub-debian-gcc-devel-lto-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)|
   |test-r-rhub-ubuntu-gcc-release-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)|
   |test-r-rocker-r-base-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rocker-r-base-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rocker-r-base-latest)|
   |test-r-rstudio-r-base-3.6-bionic|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)|
   |test-r-rstudio-r-base-3.6-centos7-devtoolset-8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)|
   |test-r-rstudio-r-base-3.6-centos8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)|
   |test-r-rstudio-r-base-3.6-opensuse15|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)|
   |test-r-rstudio-r-base-3.6-opensuse42|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)|
   |test-r-ubuntu-21.04|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-ubuntu-21.04)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-ubuntu-21.04)|
   |test-r-version-compatibility|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-version-compatibility)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-version-compatibility)|
   |test-r-versions|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-versions)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-versions)|
   |test-ubuntu-18.04-r-sanitizer|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)|


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672314835



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Oh interesting. I guess the advantage of this approach is that, for the price of this initial setting of the sentinels, we can then have a `DATAPTR()` so access all the data of the array randomly. 
   
   OTOH, we could leave the data untouched and have an `_Elt` that would check the null bitmap, and an `GetRegion()` that would also locally use the null bitmap. This `GetRegion()` would indeed copy the region data though, so potentially be more expensive, even when the region has no nulls. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692225918



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {

Review comment:
       Those are the R names, we can't do much about this here. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692226676



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {

Review comment:
       Just trying to align with the R conventions used by the functions calling this. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r671330603



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
+  // altrep object around an Array
   // data1: an external pointer to a shared pointer to the Array
   // data2: not used
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array,
+                   RTasks& tasks) {
     // we don't need the whole r6 object, just an external pointer
-    // that retain the array
+    // that retain the Array
     Pointer xp(new std::shared_ptr<Array>(array));
 
+    // we only get here if the Array data buffer is mutable
+    // UseSentinel() puts the R sentinel where the data is null
+    auto null_count = array->null_count();
+    if (null_count > 0) {

Review comment:
       maybe, but once we're in Dataptr/Getregion we no longer have the possibility to do this in a task. 
   
   Here, we always do it (although we might not need to), but in a task (so in parallel). 
   If we did it in Dataptr/Getregion we would have to bookkeep if it was done already and if not do it serially. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r674161781



##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,121 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+expect_altrep_rountrip <- function(x, fn, ...) {
+  alt <- Array$create(x)$as_vector()
+
+  expect_true(is_altrep(alt))
+  expect_identical(fn(x, ...), fn(alt, ...))
+  expect_true(is_altrep(alt))
+}
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  x <- c(1, 2, 3)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1, 2, NA_real_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_real_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))

Review comment:
       Is there a way to test the precise warning that is being emitted here? (or at least a substring thereof)

##########
File path: r/src/altrep.cpp
##########
@@ -19,9 +19,14 @@
 
 #if defined(ARROW_R_WITH_ARROW)
 
+#include <arrow/compute/api.h>

Review comment:
       Would be nice if you could cluster all Arrow includes together.

##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       Hmm, yes, I guess it _should_ be logicall inconsequential. OTOH, it may come as a surprise if people e.g. blindlessly memory-map a file in read-write mode without imagining that RArrow will modify it.

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> Options(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
 
   static void Init(R_altrep_class_t class_t, DllInfo* dll) {
     // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    R_set_altrep_Length_method(class_t, AltrepVector::Length);
+    R_set_altrep_Inspect_method(class_t, AltrepVector::Inspect);
+    R_set_altrep_Duplicate_method(class_t, AltrepVector::Duplicate);
 
     // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    R_set_altvec_Dataptr_method(class_t, AltrepVector::Dataptr);
+    R_set_altvec_Dataptr_or_null_method(class_t, AltrepVector::Dataptr_or_null);
   }
 };
 
-struct DoubleArrayNoNull {
+struct AltrepVectorDouble {
+  using Base = AltrepVector<REALSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    class_t = R_make_altreal_class("array_dbl_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+
+    R_set_altreal_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altreal_Sum_method(class_t, Base::Sum);
+    R_set_altreal_Min_method(class_t, Base::Min);
+    R_set_altreal_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-struct Int32ArrayNoNull {
+struct AltrepVectorInt32 {
+  using Base = AltrepVector<INTSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+    class_t = R_make_altinteger_class("array_int_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+    R_set_altinteger_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altinteger_Sum_method(class_t, Base::Sum);
+    R_set_altinteger_Min_method(class_t, Base::Min);
+    R_set_altinteger_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-R_altrep_class_t Int32ArrayNoNull::class_t;
-R_altrep_class_t DoubleArrayNoNull::class_t;
+R_altrep_class_t AltrepVectorInt32::class_t;
+R_altrep_class_t AltrepVectorDouble::class_t;
 
 void Init_Altrep_classes(DllInfo* dll) {
-  DoubleArrayNoNull::Init(dll);
-  Int32ArrayNoNull::Init(dll);
+  AltrepVectorDouble::Init(dll);
+  AltrepVectorInt32::Init(dll);
 }
 
-SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array) {
-  return DoubleArrayNoNull::Make(array);
+SEXP MakeAltrepVectorDouble(const std::shared_ptr<Array>& array, RTasks& tasks) {

Review comment:
       Since `tasks` is a mutable argument, can you make it `RTasks* tasks`?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,121 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+expect_altrep_rountrip <- function(x, fn, ...) {
+  alt <- Array$create(x)$as_vector()
+
+  expect_true(is_altrep(alt))
+  expect_identical(fn(x, ...), fn(alt, ...))
+  expect_true(is_altrep(alt))
+}
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  x <- c(1, 2, 3)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1, 2, NA_real_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_real_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+})
+
+test_that("altrep min/max/sum identical to R versions for int", {
+  x <- c(1L, 2L, 3L)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- c(1L, 2L, NA_integer_)
+  expect_altrep_rountrip(x, min, na.rm = TRUE)
+  expect_altrep_rountrip(x, max, na.rm = TRUE)
+  expect_altrep_rountrip(x, sum, na.rm = TRUE)
+
+  expect_altrep_rountrip(x, min)
+  expect_altrep_rountrip(x, max)
+  expect_altrep_rountrip(x, sum)
+
+  x <- rep(NA_integer_, 3)
+  expect_warning(expect_altrep_rountrip(x, min, na.rm = TRUE))
+  expect_warning(expect_altrep_rountrip(x, max, na.rm = TRUE))

Review comment:
       Same here.

##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&

Review comment:
       Why the `use_count` check? Even if the Buffer _object_ is unshared, the underlying _data_ may be shared (for example because a slice was made of a parent Buffer).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692093188



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +44,68 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+T na_sentinel();
+
+template <>
+inline double na_sentinel<double>() {
+  return NA_REAL;
+}
+
+template <>
+inline int na_sentinel<int>() {
+  return NA_INTEGER;
+}
+
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {
+  auto n = array->length();
+  auto null_count = array->null_count();
+  internal::BitmapReader bitmap_reader(array->null_bitmap()->data(), array->offset(), n);
+
+  auto* data = array->data()->GetMutableValues<T>(1);
+
+  for (R_xlen_t i = 0, k = 0; k < null_count; i++, bitmap_reader.Next()) {
+    if (bitmap_reader.IsNotSet()) {
+      k++;
+      data[i] = na_sentinel<T>();
+    }
+  }
+}
+
 template <int sexp_type>
-struct ArrayNoNull {
+struct AltrepVector {
   using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
+  // altrep object around an Array
   // data1: an external pointer to a shared pointer to the Array
   // data2: not used
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
+  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array,
+                   RTasks& tasks) {
     // we don't need the whole r6 object, just an external pointer
-    // that retain the array
+    // that retain the Array
     Pointer xp(new std::shared_ptr<Array>(array));
 
+    // we only get here if the Array data buffer is mutable
+    // UseSentinel() puts the R sentinel where the data is null
+    auto null_count = array->null_count();
+    if (null_count > 0) {
+      tasks.Append(true, [array]() {
+        UseSentinel<data_type>(array);

Review comment:
       no longer in use




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672252916



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       I thought arrays were immutable? when/how likely is this to be true?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Should this also assert altrep-ness, maybe something like this?
   
   ```suggestion
       alt <- Array$create(x)$as_vector()
       expect_true(is_altrep(alt))
       result <- fn(alt)
       # Calling fn() didn't cause the vector to lose its altrep-ness
       expect_true(is_altrep(alt))
       expect_identical(result, fn(x, ...), ...))
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       `INT32_MIN` is `NA_integer_` right? So it needs to be out of bounds too. Or does this constant come from R and already have that factored in?
   
   ```suggestion
         if (value <= INT32_MIN || value > INT32_MAX) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       I would have expected that, instead of altering the Arrow data, we would have `_ELT` and `_GET_REGION` methods that would insert the sentinels in the R vector when the data is accessed. What are the tradeoffs of that way vs. this way?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692108340



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +131,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {

Review comment:
       Can you remove this method?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {

Review comment:
       1) Can you add comments describing non-trivial methods or functions?
   2) Can you try to follow the naming conventions (here `GetRegion`)?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {
+    const auto& slice = array_->Slice(i, n);

Review comment:
       I don't think you can take a const reference here. `Array::Slice` is returning a new `shared_ptr`.

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       Does `writeable` mean the user wants to write to the data? If so, we should not return the original Arrow data in this case.

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;

Review comment:
       Nit: call this `c_type`?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {
+    const auto& slice = array_->Slice(i, n);
+
+    R_xlen_t ncopy = slice->length();
+
+    if (IsMaterialized()) {
+      // just use data2
+      memcpy(buf, reinterpret_cast<data_type*>(DATAPTR(data2())) + i,
+             ncopy * sizeof(data_type));

Review comment:
       If it's already materialized, why do a copy again? Is there a situation in which this code path is executed?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {

Review comment:
       Why do we have `Dataptr`, `DATAPTR`, `data1`, `data2`... can you try to make the names clearer and/or add comments explaining the intent?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692227273



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       I need to check that, but yeah probably




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] ursabot commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909604811


   Benchmark runs are scheduled for baseline = f60bc1267aa286713d55fc89137b4afcae3e9d0a and contender = 763781c97ef12054dc710bc9d78965cdd300a66a. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Skipped :warning: Provided benchmark filters do not have any benchmark groups to be executed on ec2-t3-xlarge-us-east-2] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/d93c570cecb14e40aba6b49ad471f722...c1fe26cb25934a36afda3e1313778ab3/)
   [Scheduled] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/0d68e720cbea4e8ab4c704223d2aa056...31c5301553bb49ec92428f1f83bfc6f5/)
   [Skipped :warning: Only ['C++', 'Java'] langs are supported on ursa-thinkcentre-m75q] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/12dd2936d12a4e52b797b92f4b487518...b1e1451732f1491fbb6f6f46ac117fee/)
   Supported benchmarks:
   ursa-i9-9960x: langs = Python, R, JavaScript
   ursa-thinkcentre-m75q: langs = C++, Java
   ec2-t3-xlarge-us-east-2: cloud = True
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-908465941


   The `AltrepArrayPrimitive` class now marks the altrep object as immutable. This makes this "a bad idea" to call `
    Dataptr(writeable = TRUE)` on it, and it should rather be accessed read-only. 
   
   However, if we make this an error, this fails in many many places, including `identical()` so we can't do that. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson closed pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson closed pull request #10730:
URL: https://github.com/apache/arrow/pull/10730


   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672252916



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       I thought arrays were immutable? when/how likely is this to be true?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Should this also assert altrep-ness, maybe something like this?
   
   ```suggestion
       alt <- Array$create(x)$as_vector()
       expect_true(is_altrep(alt))
       result <- fn(alt)
       # Calling fn() didn't cause the vector to lose its altrep-ness
       expect_true(is_altrep(alt))
       expect_identical(result, fn(x, ...), ...))
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       `INT32_MIN` is `NA_integer_` right? So it needs to be out of bounds too. Or does this constant come from R and already have that factored in?
   
   ```suggestion
         if (value <= INT32_MIN || value > INT32_MAX) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       I would have expected that, instead of altering the Arrow data, we would have `_ELT` and `_GET_REGION` methods that would insert the sentinels in the R vector when the data is accessed. What are the tradeoffs of that way vs. this way?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jonkeane commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
jonkeane commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-910562170


   OK, I just ran Array to R benchmarks locally, and the results are very impressive. I've added in other types (even though they are not (yet) backed by altrep). I've also tested both chunked and non-chunked arrays, and with nulls and without.
   
   TL;DR:
   
   For floats + ints we see huge speed ups (~10x faster) in both arrays with and without nulls as well as in arrays and chunked arrays (at least in the case where there is a single chunk*).
   
   I'm still investigating what's going on with the two fannie parquet file reads that show up as regressions. If anything, this should have sped them up. 
   
   * – we don't yet have fixtures with chunked arrays with multiple chunks. But I simulated this with a csv (which does come in with multiple chunks) and multi-chunk arrays aren't (yet) supported:
   
   ``` r
   library(arrow)
   
   nyctaxi <- read_csv_arrow("~/repos/ab_store/data/nyctaxi_2010-01.csv.gz", as_data_frame = FALSE)
   
   chunked_array <- as.vector(nyctaxi[[5]])
   .Internal(inspect(chunked_array))
   #> @7fb218000000 14 REALSXP g0c7 [REF(4)] (len=14863778, tl=0) 0.75,5.9,4,4.7,0.6,...
   
   array <- as.vector(nyctaxi[[5]]$chunk(1))
   .Internal(inspect(array))
   #> @7fb20dc3a6e0 14 REALSXP g0c0 [REF(65535)] arrow::Array<double, NONULL> len=5738, Array=<0x7fb25d5080d8>
   #>   @7fb20dc3a670 22 EXTPTRSXP g0c0 [REF(4)]
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672324588



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Chances are we might need to implement something like this for arrays that are backed by an immutable buffer, so maybe I can do that, and then we can compare/measure and evaluate how risky this is. 
   
   Then we have 3 cases: 
    - Array with no nulls: best altrep with DATAPTR(), no need for setting sentinels (what we had prior to this pull request). 
    - Array with some nulls, immutable: using GetRegion, I guess with a materialized R vector in the data2 altrep slot, so that DATAPTR() materializes and returns this. 
    
    - Array with some nulls, "mutable": hacky proposal of this pr. 
   
   Perhaps we only need the first 2. 
   
   
   
   
   
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672304332



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       specifically here when mutating does not logically change anything. it's my understanding that what's there is undefined and that it is inconsequential to change it. 
   
   it may be false however when the buffer comes from a source that prevents mutation, I don't have a specific example but I guess reading from disk or something. 

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Thanks. I'll update to add more things to `expect_altrep_rountrip()`

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       Thanks. R indeed converts to numeric in that case: 
   
   ``` r
   str(sum(as.integer(c(-2^31 + 1L))))
   #>  int -2147483647
   str(sum(as.integer(c(-2^31 + 1L, -1L))))
   #>  num -2.15e+09
   ```
   
   <sup>Created on 2021-07-19 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Oh interesting. I guess the advantage of this approach is that, for the price of this initial setting of the sentinels, we can then have a `DATAPTR()` so access all the data of the array randomly. 
   
   OTOH, we could leave the data untouched and have an `_Elt` that would check the null bitmap, and an `GetRegion()` that would also locally use the null bitmap. This `GetRegion()` would indeed copy the region data though, so potentially be more expensive, even when the region has no nulls. 

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Chances are we might need to implement something like this for arrays that are backed by an immutable buffer, so maybe I can do that, and then we can compare/measure and evaluate how risky this is. 
   
   Then we have 3 cases: 
    - Array with no nulls: best altrep with DATAPTR(), no need for setting sentinels (what we had prior to this pull request). 
    - Array with some nulls, immutable: using GetRegion, I guess with a materialized R vector in the data2 altrep slot, so that DATAPTR() materializes and returns this. 
    
    - Array with some nulls, "mutable": hacky proposal of this pr. 
   
   Perhaps we only need the first 2. 
   
   
   
   
   
   

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Also, I believe the second dot will have some code shared with altrep for ChunkedArray. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200122


   @github-actions crossbow submit -g r


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r699105566



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       In a way yes, but the user should not want that because the object is marked as immutable (in the R sense). Unfortunately, this is more of a guideline than something that is enforced by R. 
   
   Accessing data read-only is somewhat new in R, and lots of internal R code would still call `DATAPTR()` even though a read)-only equivalent would be sufficient. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672308568



##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       Thanks. R indeed converts to numeric in that case: 
   
   ``` r
   str(sum(as.integer(c(-2^31 + 1L))))
   #>  int -2147483647
   str(sum(as.integer(c(-2^31 + 1L, -1L))))
   #>  num -2.15e+09
   ```
   
   <sup>Created on 2021-07-19 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672174111



##########
File path: r/src/array_to_vector.cpp
##########
@@ -69,13 +69,25 @@ class Converter {
     // special case when there is only one array
     if (chunked_array_->num_chunks() == 1) {
       const auto& array = chunked_array_->chunk(0);
-      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0 &&
-          array->null_count() == 0) {
+      // using altrep if
+      // - the arrow.use_altrep is set to TRUE or unset
+      // - the array has at least one element
+      // - either it has no nulls or the data is mutable
+      if (arrow::r::GetBoolOption("arrow.use_altrep", true) && array->length() > 0) {
         switch (array->type()->id()) {
           case arrow::Type::DOUBLE:
-            return arrow::r::MakeDoubleArrayNoNull(array);
+            if (array->null_count() == 0 || array->data()->buffers[1]->is_mutable()) {

Review comment:
       One consequence is that we can't use altrep when taking slices of an Array as the buffer is at least held by two Arrays then. 
   
   ``` r
   library(arrow, warn.conflicts = FALSE)
   #> See arrow_info() for available features
   
   is_altrep <- arrow:::is_altrep
   
   v_int <- Array$create(c(1L, NA, 3L))
   
   is_altrep(v_int$as_vector())
   #> [1] TRUE
   
   is_altrep(as.vector(v_int$Slice(1)))
   #> [1] FALSE
   ```
   
   <sup>Created on 2021-07-19 by the [reprex package](https://reprex.tidyverse.org) (v2.0.0)</sup>
   
   I commented out the relevant tests in `test-altrep.R`. Does this matter given that the data there is irrelevant because it's identified as null ? 
   
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672252916



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +146,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {
+    if (array->null_count() == 0) {
+      return true;
+    }
+
+    return array->data().use_count() == 1 && array->data()->buffers[1].use_count() == 1 &&
+           array->data()->buffers[1]->is_mutable();

Review comment:
       I thought arrays were immutable? when/how likely is this to be true?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -59,52 +59,117 @@ test_that("altrep vectors from int32 and dbl arrays with nulls", {
   c_int <- ChunkedArray$create(c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(c(1, NA, 3))
 
-  # cannot be altrep because one NA
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_int_nonull(as.vector(v_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(1))))
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_int_nonull(as.vector(c_int$Slice(1))))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(1))))
-
-  # but then, no NA beyond, so can be altrep again
-  expect_true(is_altrep_int_nonull(as.vector(v_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(v_dbl$Slice(2))))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(2))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(2))))
+  expect_true(is_altrep(as.vector(v_int)))
+  # expect_true(is_altrep(as.vector(v_int$Slice(1))))
+  expect_true(is_altrep(as.vector(v_dbl)))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(1))))
+  expect_true(is_altrep(as.vector(c_int)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(1))))
+  expect_true(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(1))))
+
+  # expect_true(is_altrep(as.vector(v_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(v_dbl$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_int$Slice(2))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(2))))
 
   # chunked array with 2 chunks cannot be altrep
   c_int <- ChunkedArray$create(0L, c(1L, NA, 3L))
   c_dbl <- ChunkedArray$create(0, c(1, NA, 3))
   expect_equal(c_int$num_chunks, 2L)
   expect_equal(c_dbl$num_chunks, 2L)
-  expect_false(is_altrep_int_nonull(as.vector(c_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(c_dbl)))
-  expect_true(is_altrep_int_nonull(as.vector(c_int$Slice(3))))
-  expect_true(is_altrep_dbl_nonull(as.vector(c_dbl$Slice(3))))
+
+  expect_false(is_altrep(as.vector(c_int)))
+  expect_false(is_altrep(as.vector(c_dbl)))
+  # expect_true(is_altrep(as.vector(c_int$Slice(3))))
+  # expect_true(is_altrep(as.vector(c_dbl$Slice(3))))
 })
 
 test_that("empty vectors are not altrep", {
   withr::local_options(list(arrow.use_altrep = TRUE))
   v_int <- Array$create(integer())
   v_dbl <- Array$create(numeric())
 
-  expect_false(is_altrep_int_nonull(as.vector(v_int)))
-  expect_false(is_altrep_dbl_nonull(as.vector(v_dbl)))
+  expect_false(is_altrep(as.vector(v_int)))
+  expect_false(is_altrep(as.vector(v_dbl)))
 })
 
 test_that("as.data.frame(<Table>, <RecordBatch>) can create altrep vectors", {
   withr::local_options(list(arrow.use_altrep = TRUE))
 
   table <- Table$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_table <- as.data.frame(table)
-  expect_true(is_altrep_int_nonull(df_table$int))
-  expect_true(is_altrep_dbl_nonull(df_table$dbl))
+  expect_true(is_altrep(df_table$int))
+  expect_true(is_altrep(df_table$dbl))
 
   batch <- RecordBatch$create(int = c(1L, 2L, 3L), dbl = c(1, 2, 3))
   df_batch <- as.data.frame(batch)
-  expect_true(is_altrep_int_nonull(df_batch$int))
-  expect_true(is_altrep_dbl_nonull(df_batch$dbl))
+  expect_true(is_altrep(df_batch$int))
+  expect_true(is_altrep(df_batch$dbl))
+})
+
+test_that("altrep min/max/sum identical to R versions for double", {
+  expect_altrep_rountrip <- function(x, fn, ...) {
+    expect_identical(fn(x, ...), fn(Array$create(x)$as_vector(), ...))

Review comment:
       Should this also assert altrep-ness, maybe something like this?
   
   ```suggestion
       alt <- Array$create(x)$as_vector()
       expect_true(is_altrep(alt))
       result <- fn(alt)
       # Calling fn() didn't cause the vector to lose its altrep-ness
       expect_true(is_altrep(alt))
       expect_identical(result, fn(x, ...), ...))
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value < INT32_MIN || value > INT32_MAX) {

Review comment:
       `INT32_MIN` is `NA_integer_` right? So it needs to be out of bounds too. Or does this constant come from R and already have that factored in?
   
   ```suggestion
         if (value <= INT32_MIN || value > INT32_MAX) {
   ```

##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       I would have expected that, instead of altering the Arrow data, we would have `_ELT` and `_GET_REGION` methods that would insert the sentinels in the R vector when the data is accessed. What are the tradeoffs of that way vs. this way?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200122


   @github-actions crossbow submit -g r


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] ursabot edited a comment on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
ursabot edited a comment on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909604811


   Benchmark runs are scheduled for baseline = f60bc1267aa286713d55fc89137b4afcae3e9d0a and contender = 763781c97ef12054dc710bc9d78965cdd300a66a. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Skipped :warning: Provided benchmark filters do not have any benchmark groups to be executed on ec2-t3-xlarge-us-east-2] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/d93c570cecb14e40aba6b49ad471f722...c1fe26cb25934a36afda3e1313778ab3/)
   [Finished :arrow_down:2.19% :arrow_up:1.46%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/0d68e720cbea4e8ab4c704223d2aa056...31c5301553bb49ec92428f1f83bfc6f5/)
   [Skipped :warning: Only ['C++', 'Java'] langs are supported on ursa-thinkcentre-m75q] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/12dd2936d12a4e52b797b92f4b487518...b1e1451732f1491fbb6f6f46ac117fee/)
   Supported benchmarks:
   ursa-i9-9960x: langs = Python, R, JavaScript
   ursa-thinkcentre-m75q: langs = C++, Java
   ec2-t3-xlarge-us-east-2: cloud = True
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-907114426


   🤔 I think now that means that if the data has been materialized, a `Dataptr(writeable = TRUE)` might have happened, and  maybe the data has been written into. To be on the safe side, perhaps we should now treat the Array as obsolete and only ever use `data2`


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692228919



##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {
+    const auto& slice = array_->Slice(i, n);
+
+    R_xlen_t ncopy = slice->length();
+
+    if (IsMaterialized()) {
+      // just use data2
+      memcpy(buf, reinterpret_cast<data_type*>(DATAPTR(data2())) + i,
+             ncopy * sizeof(data_type));

Review comment:
       This is what `Get_region` is about: copy the slice from `i` to `i + n` into `buf`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-907069375


   Changed `Dataptr(writeable = TRUE)` so that the data is materialized.  


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200812


   Revision: 763781c97ef12054dc710bc9d78965cdd300a66a
   
   Submitted crossbow builds: [ursacomputing/crossbow @ actions-806](https://github.com/ursacomputing/crossbow/branches/all?query=actions-806)
   
   |Task|Status|
   |----|------|
   |conda-linux-gcc-py36-cpu-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py36-cpu-r40)|
   |conda-linux-gcc-py37-cpu-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-linux-gcc-py37-cpu-r41)|
   |conda-osx-clang-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py36-r40)|
   |conda-osx-clang-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-osx-clang-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-osx-clang-py37-r41)|
   |conda-win-vs2017-py36-r40|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py36-r40)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py36-r40)|
   |conda-win-vs2017-py37-r41|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-conda-win-vs2017-py37-r41)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-conda-win-vs2017-py37-r41)|
   |homebrew-r-autobrew|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-homebrew-r-autobrew)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-homebrew-r-autobrew)|
   |test-r-depsource-auto|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-depsource-auto)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-depsource-auto)|
   |test-r-depsource-system|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-depsource-system)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-depsource-system)|
   |test-r-devdocs|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-devdocs)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-devdocs)|
   |test-r-gcc-11|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-gcc-11)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-gcc-11)|
   |test-r-install-local|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-install-local)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-install-local)|
   |test-r-linux-as-cran|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-as-cran)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-as-cran)|
   |test-r-linux-rchk|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-linux-rchk)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-linux-rchk)|
   |test-r-linux-valgrind|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-linux-valgrind)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-linux-valgrind)|
   |test-r-minimal-build|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-minimal-build)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-minimal-build)|
   |test-r-rhub-debian-gcc-devel-lto-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-debian-gcc-devel-lto-latest)|
   |test-r-rhub-ubuntu-gcc-release-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rhub-ubuntu-gcc-release-latest)|
   |test-r-rocker-r-base-latest|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rocker-r-base-latest)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rocker-r-base-latest)|
   |test-r-rstudio-r-base-3.6-bionic|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-bionic)|
   |test-r-rstudio-r-base-3.6-centos7-devtoolset-8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos7-devtoolset-8)|
   |test-r-rstudio-r-base-3.6-centos8|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-centos8)|
   |test-r-rstudio-r-base-3.6-opensuse15|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse15)|
   |test-r-rstudio-r-base-3.6-opensuse42|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-r-rstudio-r-base-3.6-opensuse42)|
   |test-r-ubuntu-21.04|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-ubuntu-21.04)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-ubuntu-21.04)|
   |test-r-version-compatibility|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-version-compatibility)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-version-compatibility)|
   |test-r-versions|[![Github Actions](https://github.com/ursacomputing/crossbow/workflows/Crossbow/badge.svg?branch=actions-806-github-test-r-versions)](https://github.com/ursacomputing/crossbow/actions?query=branch:actions-806-github-test-r-versions)|
   |test-ubuntu-18.04-r-sanitizer|[![Azure](https://dev.azure.com/ursacomputing/crossbow/_apis/build/status/ursacomputing.crossbow?branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)](https://dev.azure.com/ursacomputing/crossbow/_build/latest?definitionId=1&branchName=actions-806-azure-test-ubuntu-18.04-r-sanitizer)|


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] ursabot edited a comment on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
ursabot edited a comment on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909604811


   Benchmark runs are scheduled for baseline = f60bc1267aa286713d55fc89137b4afcae3e9d0a and contender = 763781c97ef12054dc710bc9d78965cdd300a66a. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Skipped :warning: Provided benchmark filters do not have any benchmark groups to be executed on ec2-t3-xlarge-us-east-2] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/d93c570cecb14e40aba6b49ad471f722...c1fe26cb25934a36afda3e1313778ab3/)
   [Finished :arrow_down:2.19% :arrow_up:1.46%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/0d68e720cbea4e8ab4c704223d2aa056...31c5301553bb49ec92428f1f83bfc6f5/)
   [Skipped :warning: Only ['C++', 'Java'] langs are supported on ursa-thinkcentre-m75q] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/12dd2936d12a4e52b797b92f4b487518...b1e1451732f1491fbb6f6f46ac117fee/)
   Supported benchmarks:
   ursa-i9-9960x: langs = Python, R, JavaScript
   ursa-thinkcentre-m75q: langs = C++, Java
   ec2-t3-xlarge-us-east-2: cloud = True
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r672327530



##########
File path: r/src/altrep.cpp
##########
@@ -44,25 +49,58 @@ extern "C" {
 #endif
 
 #include <arrow/array.h>
+#include <arrow/util/bitmap_reader.h>
+
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+template <typename T>
+void UseSentinel(const std::shared_ptr<Array>& array) {

Review comment:
       Also, I believe the second dot will have some code shared with altrep for ChunkedArray. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-908465941


   The `AltrepArrayPrimitive` class now marks the altrep object as immutable. This makes this "a bad idea" to call `
    Dataptr(writeable = TRUE)` on it, and it should rather be accessed read-only. 
   
   However, if we make this an error, this fails in many many places, including `identical()` so we can't do that. 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909603305


   I _think_ that's all clean. @jonkeane do you want to do any benchmarking before we merge this?


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jonkeane commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
jonkeane commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909688448


   Yeah, we should run these manually. I'll try to get to that tomorrow. None of the array conversion benchmarks have been wired up in the benchmarks package, so we don't have them available on conbench right now.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nealrichardson commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
nealrichardson commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-909200122






-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#issuecomment-881533851


   this needs tests about min/max/sum


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] romainfrancois commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

Posted by GitBox <gi...@apache.org>.
romainfrancois commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r691973870



##########
File path: r/src/altrep.cpp
##########
@@ -104,63 +143,138 @@ struct ArrayNoNull {
     return const_cast<void*>(Dataptr_or_null(vec));
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  static int No_NA(SEXP vec) { return Get(vec)->null_count() == 0; }
+
+  static SEXP Min(SEXP x, Rboolean narm) { return MinMax(x, narm, "min", R_PosInf); }
+
+  static SEXP Max(SEXP x, Rboolean narm) { return MinMax(x, narm, "max", R_NegInf); }
+
+  static SEXP MinMax(SEXP x, Rboolean narm, const std::string& field, double inf) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto n = array->length();
+    auto null_count = array->null_count();
+    if ((na_rm || n == 0) && null_count == n) {
+      return Rf_ScalarReal(inf);
+    }
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+
+    auto options = Options(array, na_rm);
+
+    const auto& minmax =
+        ValueOrStop(arrow::compute::CallFunction("min_max", {array}, options.get()));
+    const auto& minmax_scalar =
+        internal::checked_cast<const StructScalar&>(*minmax.scalar());
+
+    const auto& result_scalar = internal::checked_cast<const scalar_type&>(
+        *ValueOrStop(minmax_scalar.field(field)));
+    return cpp11::as_sexp(result_scalar.value);
+  }
+
+  static SEXP Sum(SEXP x, Rboolean narm) {
+    const auto& array = Get(x);
+    bool na_rm = narm == TRUE;
+    auto null_count = array->null_count();
+
+    if (!na_rm && null_count > 0) {
+      return cpp11::as_sexp(cpp11::na<data_type>());
+    }
+    auto options = Options(array, na_rm);
+
+    const auto& sum =
+        ValueOrStop(arrow::compute::CallFunction("sum", {array}, options.get()));
+
+    if (sexp_type == INTSXP) {
+      // When calling the "sum" function on an int32 array, we get an Int64 scalar
+      // in case of overflow, make it a double like R
+      int64_t value = internal::checked_cast<const Int64Scalar&>(*sum.scalar()).value;
+      if (value <= INT32_MIN || value > INT32_MAX) {
+        return Rf_ScalarReal(static_cast<double>(value));
+      } else {
+        return Rf_ScalarInteger(static_cast<int>(value));
+      }
+    } else {
+      return Rf_ScalarReal(
+          internal::checked_cast<const DoubleScalar&>(*sum.scalar()).value);
+    }
+  }
+
+  static std::shared_ptr<arrow::compute::ScalarAggregateOptions> Options(
+      const std::shared_ptr<Array>& array, bool na_rm) {
+    auto options = std::make_shared<arrow::compute::ScalarAggregateOptions>(
+        arrow::compute::ScalarAggregateOptions::Defaults());
+    options->min_count = 0;
+    options->skip_nulls = na_rm;
+    return options;
+  }
 
   static void Init(R_altrep_class_t class_t, DllInfo* dll) {
     // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    R_set_altrep_Length_method(class_t, AltrepVector::Length);
+    R_set_altrep_Inspect_method(class_t, AltrepVector::Inspect);
+    R_set_altrep_Duplicate_method(class_t, AltrepVector::Duplicate);
 
     // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    R_set_altvec_Dataptr_method(class_t, AltrepVector::Dataptr);
+    R_set_altvec_Dataptr_or_null_method(class_t, AltrepVector::Dataptr_or_null);
   }
 };
 
-struct DoubleArrayNoNull {
+struct AltrepVectorDouble {
+  using Base = AltrepVector<REALSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    class_t = R_make_altreal_class("array_dbl_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+
+    R_set_altreal_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altreal_Sum_method(class_t, Base::Sum);
+    R_set_altreal_Min_method(class_t, Base::Min);
+    R_set_altreal_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-struct Int32ArrayNoNull {
+struct AltrepVectorInt32 {
+  using Base = AltrepVector<INTSXP>;
   static R_altrep_class_t class_t;
 
   static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+    class_t = R_make_altinteger_class("array_int_vector", "arrow", dll);
+    Base::Init(class_t, dll);
+    R_set_altinteger_No_NA_method(class_t, Base::No_NA);
+
+    R_set_altinteger_Sum_method(class_t, Base::Sum);
+    R_set_altinteger_Min_method(class_t, Base::Min);
+    R_set_altinteger_Max_method(class_t, Base::Max);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  static SEXP Make(const std::shared_ptr<Array>& array, RTasks& tasks) {
+    return Base::Make(class_t, array, tasks);
   }
 };
 
-R_altrep_class_t Int32ArrayNoNull::class_t;
-R_altrep_class_t DoubleArrayNoNull::class_t;
+R_altrep_class_t AltrepVectorInt32::class_t;
+R_altrep_class_t AltrepVectorDouble::class_t;
 
 void Init_Altrep_classes(DllInfo* dll) {
-  DoubleArrayNoNull::Init(dll);
-  Int32ArrayNoNull::Init(dll);
+  AltrepVectorDouble::Init(dll);
+  AltrepVectorInt32::Init(dll);
 }
 
-SEXP MakeDoubleArrayNoNull(const std::shared_ptr<Array>& array) {
-  return DoubleArrayNoNull::Make(array);
+SEXP MakeAltrepVectorDouble(const std::shared_ptr<Array>& array, RTasks& tasks) {

Review comment:
       we no longer use tasks there as there is nothing to parallelise




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org