You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2023/06/09 15:02:54 UTC

[arrow-nanoarrow] branch main updated: feat: Include dictionary member in `ArrowArrayView` struct (#221)

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c738f90  feat: Include dictionary member in `ArrowArrayView` struct (#221)
c738f90 is described below

commit c738f90e874ec38b2fa2acebd154a33da36dc9a4
Author: Dewey Dunnington <de...@dunnington.ca>
AuthorDate: Fri Jun 9 11:02:49 2023 -0400

    feat: Include dictionary member in `ArrowArrayView` struct (#221)
    
    This doesn't add any features really, but ensures that one can walk
    `ArrowSchema`, `ArrowArray`, and `ArrowArrayView` recursively using the
    same pattern. I'd like to include this in the 0.2 release because it is
    very difficult to work around this limitation: for example, if you have
    a deeply nested dictionary field, you currently have to walk your whole
    tree of arrays twice (once to validate the non-dictionary bits, once
    looking for dictionary bits that may or may not exist to validate). The
    R package worked around this in some creative but error-prone ways and
    I'd like to avoid anybody else attempting creative workarounds when this
    is a feature that will almost certainly get added soon.
    
    Another application of this is device array support, since that PR
    essentially constructs an `ArrowArrayView` and recursively copies the
    buffer view from the array view to a standalone buffer using some
    device-specific logic.
---
 r/R/array.R                     |  25 +++-----
 r/src/array.c                   |  33 ++++++++---
 r/src/array.h                   |   2 +-
 src/nanoarrow/array.c           | 127 +++++++++++++++++++++++++++++++++++++---
 src/nanoarrow/array_inline.h    |  10 +++-
 src/nanoarrow/array_test.cc     |  64 ++++++++++++++++++++
 src/nanoarrow/nanoarrow.h       |   7 ++-
 src/nanoarrow/nanoarrow_types.h |   3 +
 8 files changed, 235 insertions(+), 36 deletions(-)

diff --git a/r/R/array.R b/r/R/array.R
index 2e36c57..e9e5f6f 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -180,34 +180,24 @@ nanoarrow_array_proxy <- function(array, schema = NULL, recursive = FALSE) {
     array_view <- .Call(nanoarrow_c_array_view, array, schema)
     result <- .Call(nanoarrow_c_array_proxy, array, array_view, recursive)
 
-    # Pass on some information from the schema if we have it
-    if (!is.null(result$dictionary)) {
-      nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
-    }
-
     names(result$children) <- names(schema$children)
 
     if (!recursive) {
+      # Pass on some information from the schema if we have it
       result$children <- Map(
         nanoarrow_array_set_schema,
         result$children,
         schema$children
       )
+
+      if (!is.null(result$dictionary)) {
+        nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
+      }
     }
   } else {
     result <- .Call(nanoarrow_c_array_proxy, array, NULL, recursive)
   }
 
-  # Recursive-ness of the dictionary is handled here because it's not
-  # part of the array view
-  if (recursive && !is.null(result$dictionary)) {
-    result$dictionary <- nanoarrow_array_proxy(
-      result$dictionary,
-      schema = schema$dictionary,
-      recursive = TRUE
-    )
-  }
-
   result
 }
 
@@ -317,9 +307,12 @@ nanoarrow_array_modify <- function(array, new_values, validate = TRUE) {
       dictionary = {
         if (!is.null(value)) {
           value <- as_nanoarrow_array(value)
+          value_copy <- array_shallow_copy(value, validate = validate)
+        } else {
+          value_copy <- NULL
         }
 
-        .Call(nanoarrow_c_array_set_dictionary, array_copy, value)
+        .Call(nanoarrow_c_array_set_dictionary, array_copy, value_copy)
 
         if (!is.null(schema) && !is.null(value)) {
           schema <- nanoarrow_schema_modify(
diff --git a/r/src/array.c b/r/src/array.c
index f8e839e..145d196 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -215,7 +215,8 @@ SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP dictionary_xptr) {
       }
     }
 
-    array_export(dictionary_xptr, array->dictionary);
+    struct ArrowArray* dictionary = array_from_xptr(dictionary_xptr);
+    ArrowArrayMove(dictionary, array->dictionary);
   }
 
   return R_NilValue;
@@ -249,11 +250,7 @@ static int move_array_buffers(struct ArrowArray* src, struct ArrowArray* dst,
                                                schema->children[i], error));
   }
 
-  // ArrowArrayInit() never initializes a dictionary, so do it here
   if (src->dictionary != NULL) {
-    NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(dst));
-    NANOARROW_RETURN_NOT_OK(
-        ArrowArrayInitFromSchema(dst->dictionary, schema->dictionary, error));
     NANOARROW_RETURN_NOT_OK(
         move_array_buffers(src->dictionary, dst->dictionary, schema->dictionary, error));
   }
@@ -365,6 +362,15 @@ static SEXP borrow_array_view_child(struct ArrowArrayView* array_view, int64_t i
   }
 }
 
+static SEXP borrow_array_view_dictionary(struct ArrowArrayView* array_view,
+                                         SEXP shelter) {
+  if (array_view != NULL) {
+    return R_MakeExternalPtr(array_view->dictionary, R_NilValue, shelter);
+  } else {
+    return R_NilValue;
+  }
+}
+
 static SEXP borrow_unknown_buffer(struct ArrowArray* array, int64_t i, SEXP shelter) {
   return buffer_borrowed_xptr(array->buffers[i], 0, shelter);
 }
@@ -442,10 +448,21 @@ SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr, SEXP recursi
     UNPROTECT(1);
   }
 
-  // The recursive-ness of the dictionary is handled in R because this is not part
-  // of the struct ArrowArrayView.
   if (array->dictionary != NULL) {
-    SET_VECTOR_ELT(array_proxy, 5, borrow_array_xptr(array->dictionary, array_xptr));
+    SEXP dictionary_xptr = PROTECT(borrow_array_xptr(array->dictionary, array_xptr));
+
+    if (recursive) {
+      SEXP dictionary_view_xptr =
+        PROTECT(borrow_array_view_dictionary(array_view, array_view_xptr));
+      SEXP dictionary_proxy = PROTECT(
+          nanoarrow_c_array_proxy(dictionary_xptr, dictionary_view_xptr, recursive_sexp));
+      SET_VECTOR_ELT(array_proxy, 5, dictionary_proxy);
+      UNPROTECT(2);
+    } else {
+      SET_VECTOR_ELT(array_proxy, 5, dictionary_xptr);
+    }
+
+    UNPROTECT(1);
   }
 
   UNPROTECT(1);
diff --git a/r/src/array.h b/r/src/array.h
index 18cfa4b..5d0b9ac 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -145,7 +145,7 @@ static inline void array_export(SEXP array_xptr, struct ArrowArray* array_copy)
   result = ArrowArrayAllocateChildren(array_copy, array->n_children);
   if (result != NANOARROW_OK) {
     array_copy->release(array_copy);
-    Rf_error("ArrowArraySetBuffer() failed");
+    Rf_error("ArrowArrayAllocateChildren() failed");
   }
 
   for (int64_t i = 0; i < array->n_children; i++) {
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index b30f3be..f1524cf 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -168,21 +168,40 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
 ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
                                            struct ArrowArrayView* array_view,
                                            struct ArrowError* error) {
-  ArrowArrayInitFromType(array, array_view->storage_type);
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowArrayInitFromType(array, array_view->storage_type), error);
+  int result;
+
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
+  private_data->layout = array_view->layout;
 
-  int result = ArrowArrayAllocateChildren(array, array_view->n_children);
-  if (result != NANOARROW_OK) {
-    array->release(array);
-    return result;
+  if (array_view->n_children > 0) {
+    result = ArrowArrayAllocateChildren(array, array_view->n_children);
+    if (result != NANOARROW_OK) {
+      array->release(array);
+      return result;
+    }
+
+    for (int64_t i = 0; i < array_view->n_children; i++) {
+      result =
+          ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
+      if (result != NANOARROW_OK) {
+        array->release(array);
+        return result;
+      }
+    }
   }
 
-  private_data->layout = array_view->layout;
+  if (array_view->dictionary != NULL) {
+    result = ArrowArrayAllocateDictionary(array);
+    if (result != NANOARROW_OK) {
+      array->release(array);
+      return result;
+    }
 
-  for (int64_t i = 0; i < array_view->n_children; i++) {
-    int result =
-        ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
+    result =
+        ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error);
     if (result != NANOARROW_OK) {
       array->release(array);
       return result;
@@ -321,6 +340,20 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v
     }
   }
 
+  if (array->dictionary != NULL) {
+    result = ArrowArrayViewAllocateDictionary(array_view);
+    if (result != NANOARROW_OK) {
+      ArrowArrayViewReset(array_view);
+      return result;
+    }
+
+    result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary);
+    if (result != NANOARROW_OK) {
+      ArrowArrayViewReset(array_view);
+      return result;
+    }
+  }
+
   return NANOARROW_OK;
 }
 
@@ -393,6 +426,10 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) {
     NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
   }
 
+  if (array->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
+  }
+
   return NANOARROW_OK;
 }
 
@@ -407,6 +444,10 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
   for (int64_t i = 0; i < array->n_children; i++) {
     ArrowArrayFlushInternalPointers(array->children[i]);
   }
+
+  if (array->dictionary != NULL) {
+    ArrowArrayFlushInternalPointers(array->dictionary);
+  }
 }
 
 ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
@@ -478,6 +519,21 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
   return NANOARROW_OK;
 }
 
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) {
+  if (array_view->dictionary != NULL) {
+    return EINVAL;
+  }
+
+  array_view->dictionary =
+      (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
+  if (array_view->dictionary == NULL) {
+    return ENOMEM;
+  }
+
+  ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED);
+  return NANOARROW_OK;
+}
+
 ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
                                             struct ArrowSchema* schema,
                                             struct ArrowError* error) {
@@ -506,6 +562,21 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
     }
   }
 
+  if (schema->dictionary != NULL) {
+    result = ArrowArrayViewAllocateDictionary(array_view);
+    if (result != NANOARROW_OK) {
+      ArrowArrayViewReset(array_view);
+      return result;
+    }
+
+    result =
+        ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error);
+    if (result != NANOARROW_OK) {
+      ArrowArrayViewReset(array_view);
+      return result;
+    }
+  }
+
   if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
       array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
     array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
@@ -537,6 +608,11 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
     ArrowFree(array_view->children);
   }
 
+  if (array_view->dictionary != NULL) {
+    ArrowArrayViewReset(array_view->dictionary);
+    ArrowFree(array_view->dictionary);
+  }
+
   if (array_view->union_type_id_map != NULL) {
     ArrowFree(array_view->union_type_id_map);
   }
@@ -652,6 +728,22 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
                                                            array->children[i], error));
   }
 
+  // Check dictionary
+  if (array->dictionary == NULL && array_view->dictionary != NULL) {
+    ArrowErrorSet(error, "Expected dictionary but found NULL");
+    return EINVAL;
+  }
+
+  if (array->dictionary != NULL && array_view->dictionary == NULL) {
+    ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member");
+    return EINVAL;
+  }
+
+  if (array->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(
+        ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error));
+  }
+
   return NANOARROW_OK;
 }
 
@@ -767,6 +859,11 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
         ArrowArrayViewValidateMinimal(array_view->children[i], error));
   }
 
+  // Recurse for dictionary
+  if (array_view->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error));
+  }
+
   return NANOARROW_OK;
 }
 
@@ -903,6 +1000,11 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
         ArrowArrayViewValidateDefault(array_view->children[i], error));
   }
 
+  // Recurse for dictionary
+  if (array_view->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error));
+  }
+
   return NANOARROW_OK;
 }
 
@@ -1047,10 +1149,17 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
     }
   }
 
+  // Recurse for children
   for (int64_t i = 0; i < array_view->n_children; i++) {
     NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
   }
 
+  // Dictionary valiation not implemented
+  if (array_view->dictionary != NULL) {
+    ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented");
+    return ENOTSUP;
+  }
+
   return NANOARROW_OK;
 }
 
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index dda4634..634d5d1 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -150,11 +150,15 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array)
     }
   }
 
-  // Start building any child arrays
+  // Start building any child arrays or dictionaries
   for (int64_t i = 0; i < array->n_children; i++) {
     NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i]));
   }
 
+  if (array->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary));
+  }
+
   return NANOARROW_OK;
 }
 
@@ -168,6 +172,10 @@ static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) {
     NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i]));
   }
 
+  if (array->dictionary != NULL) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary));
+  }
+
   return NANOARROW_OK;
 }
 
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 3170efa..d9e0de0 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -1950,6 +1950,70 @@ TEST(ArrayTest, ArrayViewTestFixedSizeListArray) {
   array.release(&array);
 }
 
+TEST(ArrayTest, ArrayViewTestDictionary) {
+  struct ArrowSchema schema;
+  struct ArrowArrayView array_view;
+
+  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaAllocateDictionary(&schema), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaInitFromType(schema.dictionary, NANOARROW_TYPE_STRING),
+            NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(array_view.storage_type, NANOARROW_TYPE_INT32);
+  ASSERT_NE(array_view.dictionary, nullptr);
+  EXPECT_EQ(array_view.dictionary->storage_type, NANOARROW_TYPE_STRING);
+
+  // Build a dictionary array
+  struct ArrowArray array;
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.dictionary, ArrowCharView("abc")), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.dictionary, ArrowCharView("def")), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+  EXPECT_EQ(array_view.buffer_views[1].size_bytes, 2 * sizeof(int32_t));
+  EXPECT_EQ(array_view.dictionary->buffer_views[2].size_bytes, 6);
+
+  // Full validation not yet supported for dictionary
+  EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, nullptr),
+            ENOTSUP);
+
+  EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 1), 1);
+
+  struct ArrowStringView item;
+  item = ArrowArrayViewGetStringUnsafe(array_view.dictionary, 0);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "abc");
+  item = ArrowArrayViewGetStringUnsafe(array_view.dictionary, 1);
+  EXPECT_EQ(std::string(item.data, item.size_bytes), "def");
+
+  array.release(&array);
+
+  // Setting a non-dictionary array should error
+  struct ArrowError error;
+  ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_INT32), NANOARROW_OK);
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected dictionary but found NULL");
+
+  array.release(&array);
+
+  // Setting a dictionary array to a non-dictionary array view should error
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ArrowArrayViewReset(&array_view);
+  ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_INT32);
+  EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+  EXPECT_STREQ(error.message, "Expected NULL dictionary but found dictionary member");
+
+  schema.release(&schema);
+  array.release(&array);
+  ArrowArrayViewReset(&array_view);
+}
+
 TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
   struct ArrowArrayView array_view;
   struct ArrowArray array;
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 41570df..372b9dd 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -112,6 +112,8 @@
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema)
 #define ArrowArrayViewAllocateChildren \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren)
+#define ArrowArrayViewAllocateDictionary \
+  NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary)
 #define ArrowArrayViewSetLength \
   NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength)
 #define ArrowArrayViewSetArray \
@@ -939,12 +941,15 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
                                             struct ArrowSchema* schema,
                                             struct ArrowError* error);
 
-/// \brief Allocate the schema_view->children array
+/// \brief Allocate the array_view->children array
 ///
 /// Includes the memory for each child struct ArrowArrayView
 ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
                                               int64_t n_children);
 
+/// \brief Allocate array_view->dictionary
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view);
+
 /// \brief Set data-independent buffer sizes from length
 void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length);
 
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 79954db..45ee3c6 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -608,6 +608,9 @@ struct ArrowArrayView {
   /// \brief Pointers to views of this array's children
   struct ArrowArrayView** children;
 
+  /// \brief Pointer to a view of this array's dictionary
+  struct ArrowArrayView* dictionary;
+
   /// \brief Union type id to child index mapping
   ///
   /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer