You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/19 13:42:05 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #10730: ARROW-13164: [R] altrep vectors from Array with nulls

pitrou commented on a change in pull request #10730:
URL: https://github.com/apache/arrow/pull/10730#discussion_r692108340



##########
File path: r/src/array_to_vector.cpp
##########
@@ -133,6 +131,16 @@ class Converter {
 
  protected:
   std::shared_ptr<ChunkedArray> chunked_array_;
+
+ private:
+  bool CanAltrep(const std::shared_ptr<Array>& array) {

Review comment:
       Can you remove this method?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {

Review comment:
       1) Can you add comments describing non-trivial methods or functions?
   2) Can you try to follow the naming conventions (here `GetRegion`)?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {
+    const auto& slice = array_->Slice(i, n);

Review comment:
       I don't think you can take a const reference here. `Array::Slice` is returning a new `shared_ptr`.

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());

Review comment:
       Does `writeable` mean the user wants to write to the data? If so, we should not return the original Arrow data in this case.

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;

Review comment:
       Nit: call this `c_type`?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<void*>(array_data());
+    }
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altreal_class("array_nonull_dbl_vector", "arrow", dll);
-    ArrayNoNull<REALSXP>::Init(class_t, dll);
-    R_set_altreal_No_NA_method(class_t, ArrayNoNull<REALSXP>::No_NA);
+    Materialize();
+    return DATAPTR(data2());
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<REALSXP>::Make(class_t, array);
+  // by design, there are missing values in this vector
+  int No_NA() { return array_->null_count() != 0; }
+
+  // There are no guarantee that the data is the R sentinel,
+  // so we have to treat it specially
+  data_type Elt(R_xlen_t i) {
+    return array_->IsNull(i) ? cpp11::na<data_type>()
+                             : array_->data()->template GetValues<data_type>(1)[i];
   }
-};
 
-struct Int32ArrayNoNull {
-  static R_altrep_class_t class_t;
+  SEXP alt_;
+  const std::shared_ptr<Array>& array_;
 
-  static void Init(DllInfo* dll) {
-    class_t = R_make_altinteger_class("array_nonull_int_vector", "arrow", dll);
-    ArrayNoNull<INTSXP>::Init(class_t, dll);
-    R_set_altinteger_No_NA_method(class_t, ArrayNoNull<INTSXP>::No_NA);
+  const data_type* const_array_data() const {
+    return array_->data()->template GetValues<data_type>(1);
   }
 
-  static SEXP Make(const std::shared_ptr<Array>& array) {
-    return ArrayNoNull<INTSXP>::Make(class_t, array);
+  data_type* array_data() const {
+    return const_cast<data_type*>(array_->data()->template GetValues<data_type>(1));
+  }
+
+  R_xlen_t Get_region(R_xlen_t i, R_xlen_t n, data_type* buf) {
+    const auto& slice = array_->Slice(i, n);
+
+    R_xlen_t ncopy = slice->length();
+
+    if (IsMaterialized()) {
+      // just use data2
+      memcpy(buf, reinterpret_cast<data_type*>(DATAPTR(data2())) + i,
+             ncopy * sizeof(data_type));

Review comment:
       If it's already materialized, why do a copy again? Is there a situation in which this code path is executed?

##########
File path: r/src/altrep.cpp
##########
@@ -43,144 +50,356 @@ extern "C" {
 #include <R_ext/Altrep.h>
 #endif
 
-#include <arrow/array.h>
+#include "./r_task_group.h"
 
 namespace arrow {
 namespace r {
 
+namespace altrep {
+
+// specialized altrep vector for when there are some nulls
 template <int sexp_type>
-struct ArrayNoNull {
-  using data_type = typename std::conditional<sexp_type == INTSXP, int, double>::type;
+struct AltrepArrayPrimitive {
   static void DeleteArray(std::shared_ptr<Array>* ptr) { delete ptr; }
   using Pointer = cpp11::external_pointer<std::shared_ptr<Array>, DeleteArray>;
 
-  // altrep object around an Array with no nulls
-  // data1: an external pointer to a shared pointer to the Array
-  // data2: not used
+  static R_altrep_class_t class_t;
 
-  static SEXP Make(R_altrep_class_t class_t, const std::shared_ptr<Array>& array) {
-    // we don't need the whole r6 object, just an external pointer
-    // that retain the array
-    Pointer xp(new std::shared_ptr<Array>(array));
+  using data_type = typename std::conditional<sexp_type == REALSXP, double, int>::type;
+  constexpr static int r_type = sexp_type;
 
-    SEXP res = R_new_altrep(class_t, xp, R_NilValue);
-    MARK_NOT_MUTABLE(res);
+  explicit AltrepArrayPrimitive(SEXP alt)
+      : alt_(alt), array_(*Pointer(R_altrep_data1(alt_))) {}
 
-    return res;
+  explicit AltrepArrayPrimitive(const std::shared_ptr<Array>& array)
+      : alt_(R_new_altrep(class_t, Pointer(new std::shared_ptr<Array>(array)),
+                          R_NilValue)),
+        array_(array) {
+    MARK_NOT_MUTABLE(alt_);
   }
 
-  static Rboolean Inspect(SEXP x, int pre, int deep, int pvec,
-                          void (*inspect_subtree)(SEXP, int, int, int)) {
-    const auto& array = Get(x);
-    Rprintf("arrow::Array<%s, NONULL> len=%d, Array=<%p>\n",
-            array->type()->ToString().c_str(), array->length(), array.get());
-    inspect_subtree(R_altrep_data1(x), pre, deep + 1, pvec);
-    return TRUE;
-  }
+  SEXP data1() { return R_altrep_data1(alt_); }
 
-  static const std::shared_ptr<Array>& Get(SEXP vec) {
-    return *Pointer(R_altrep_data1(vec));
-  }
+  SEXP data2() { return R_altrep_data2(alt_); }
 
-  static R_xlen_t Length(SEXP vec) { return Get(vec)->length(); }
+  R_xlen_t Length() { return array_->length(); }
 
-  static const void* Dataptr_or_null(SEXP vec) {
-    return Get(vec)->data()->template GetValues<data_type>(1);
-  }
+  bool IsMaterialized() { return !Rf_isNull(data2()); }
+
+  void Materialize() {
+    if (!IsMaterialized()) {
+      auto size = array_->length();
 
-  static SEXP Duplicate(SEXP vec, Rboolean) {
-    const auto& array = Get(vec);
-    auto size = array->length();
+      // create a standard R vector
+      SEXP copy = PROTECT(Rf_allocVector(sexp_type, array_->length()));
 
-    SEXP copy = PROTECT(Rf_allocVector(sexp_type, array->length()));
+      // copy the data from the array, through Get_region
+      Get_region(0, size, reinterpret_cast<data_type*>(DATAPTR(copy)));
 
-    memcpy(DATAPTR(copy), Dataptr_or_null(vec), size * sizeof(data_type));
+      // store as data2
+      MARK_NOT_MUTABLE(copy);
 
-    UNPROTECT(1);
-    return copy;
+      R_set_altrep_data2(alt_, copy);
+      UNPROTECT(1);
+    }
   }
 
-  static void* Dataptr(SEXP vec, Rboolean writeable) {
-    return const_cast<void*>(Dataptr_or_null(vec));
+  SEXP Duplicate(Rboolean deep) {
+    Materialize();
+    return Rf_lazy_duplicate(data2());
   }
 
-  // by definition, there are no NA
-  static int No_NA(SEXP vec) { return 1; }
+  Rboolean Inspect(int pre, int deep, int pvec,
+                   void (*inspect_subtree)(SEXP, int, int, int)) {
+    Rprintf("arrow::Array<%s, %d nulls, %s> len=%d, Array=<%p>\n",
+            array_->type()->ToString().c_str(), array_->null_count(),
+            IsMaterialized() ? "materialized" : "not materialized", array_->length(),
+            array_.get());
+    inspect_subtree(data1(), pre, deep + 1, pvec);
+    if (IsMaterialized()) {
+      inspect_subtree(data2(), pre, deep + 1, pvec);
+    }
+    return TRUE;
+  }
+
+  const void* Dataptr_or_null() {
+    if (array_->null_count() == 0) {
+      return reinterpret_cast<const void*>(const_array_data());
+    }
 
-  static void Init(R_altrep_class_t class_t, DllInfo* dll) {
-    // altrep
-    R_set_altrep_Length_method(class_t, ArrayNoNull::Length);
-    R_set_altrep_Inspect_method(class_t, ArrayNoNull::Inspect);
-    R_set_altrep_Duplicate_method(class_t, ArrayNoNull::Duplicate);
+    if (IsMaterialized()) {
+      return DATAPTR_RO(data2());
+    }
 
-    // altvec
-    R_set_altvec_Dataptr_method(class_t, ArrayNoNull::Dataptr);
-    R_set_altvec_Dataptr_or_null_method(class_t, ArrayNoNull::Dataptr_or_null);
+    return NULL;
   }
-};
 
-struct DoubleArrayNoNull {
-  static R_altrep_class_t class_t;
+  // force materialization, and then return data ptr
+  void* Dataptr(Rboolean writeable) {

Review comment:
       Why do we have `Dataptr`, `DATAPTR`, `data1`, `data2`... can you try to make the names clearer and/or add comments explaining the intent?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org