You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/10 15:55:31 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #11738: ARROW-14169: [R] altrep for factors

pitrou commented on a change in pull request #11738:
URL: https://github.com/apache/arrow/pull/11738#discussion_r781317335



##########
File path: r/src/altrep.cpp
##########
@@ -366,6 +386,306 @@ struct AltrepVectorPrimitive : public AltrepVectorBase<AltrepVectorPrimitive<sex
 template <int sexp_type>
 R_altrep_class_t AltrepVectorPrimitive<sexp_type>::class_t;
 
+struct AltrepFactor : public AltrepVectorBase<AltrepFactor> {
+  // singleton altrep class description
+  static R_altrep_class_t class_t;
+
+  using Base = AltrepVectorBase<AltrepFactor>;
+  using Base::IsMaterialized;
+
+  // redefining because data2 is a paired list with the representation as the
+  // first node: the CAR
+  static SEXP Representation(SEXP alt) { return CAR(R_altrep_data2(alt)); }
+
+  static void SetRepresentation(SEXP alt, SEXP x) { SETCAR(R_altrep_data2(alt), x); }
+
+  // The CADR(data2) is used to store the transposed arrays when unification is needed
+  // In that case we store a vector of Buffers
+  using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+  static bool WasUnified(SEXP alt) { return !Rf_isNull(CADR(R_altrep_data2(alt))); }
+
+  static const std::shared_ptr<Buffer>& GetArrayTransposed(SEXP alt, int i) {
+    const auto& arrays = *Pointer<BufferVector>(CADR(R_altrep_data2(alt)));
+    return (*arrays)[i];
+  }
+
+  static SEXP Make(const std::shared_ptr<ChunkedArray>& chunked_array) {
+    // only dealing with dictionaries of strings
+    if (internal::checked_cast<const DictionaryArray&>(*chunked_array->chunk(0))
+            .dictionary()
+            ->type_id() != Type::STRING) {

Review comment:
       This will crash if there are zero chunks. Shouldn't you simply use `checked_cast<const DictionaryArray&>(*chunked_array->type()).value_type()->id()`?

##########
File path: r/tests/testthat/test-altrep.R
##########
@@ -329,9 +329,31 @@ test_that("ChunkedArray$create(...) keeps Array even when from altrep vectors",
   expect_true(x$chunk(4)$Same(d))
   expect_true(x$chunk(5)$Same(e$chunk(0)))
   expect_true(x$chunk(6)$Same(e$chunk(1)))
+})
+
+test_that("dictionaries chunked arrays are made altrep", {
+  # without unification
+  x <- ChunkedArray$create(
+    factor(c("a", "b")              , levels = letters[1:5]),
+    factor(c("d", "c", "a", NA, "e"), levels = letters[1:5])
+  )
+  f <- x$as_vector()
+  expect_true(is_arrow_altrep(f))
+  expect_equal(levels(f), letters[1:5])
+  expect_equal(as.integer(f), c(1L, 2L, 4L, 3L, 1L, NA_integer_, 5L))
 
+  # with unification
+  x <- ChunkedArray$create(
+    factor(c("a", "b"), levels = c("a", "b")),
+    factor(c("d", "c", "a", NA, "e"), levels = c("d", "c", "a", "e"))
+  )
+  f <- x$as_vector()
+  expect_true(is_arrow_altrep(f))
+  expect_equal(levels(f), c("a", "b", "d", "c", "e"))
+  expect_equal(as.integer(f), c(1L, 2L, 3L, 4L, 1L, NA_integer_, 5L))
 })

Review comment:
       Can we add a test with zero chunks here?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org