You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2023/06/09 15:02:54 UTC
[arrow-nanoarrow] branch main updated: feat: Include dictionary member in `ArrowArrayView` struct (#221)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new c738f90 feat: Include dictionary member in `ArrowArrayView` struct (#221)
c738f90 is described below
commit c738f90e874ec38b2fa2acebd154a33da36dc9a4
Author: Dewey Dunnington <de...@dunnington.ca>
AuthorDate: Fri Jun 9 11:02:49 2023 -0400
feat: Include dictionary member in `ArrowArrayView` struct (#221)
This doesn't add any features really, but ensures that one can walk
`ArrowSchema`, `ArrowArray`, and `ArrowArrayView` recursively using the
same pattern. I'd like to include this in the 0.2 release because it is
very difficult to work around this limitation: for example, if you have
a deeply nested dictionary field, you currently have to walk your whole
tree of arrays twice (once to validate the non-dictionary bits, once
looking for dictionary bits that may or may not exist to validate). The
R package worked around this in some creative but error-prone ways and
I'd like to avoid anybody else attempting creative workarounds when this
is a feature that will almost certainly get added soon.
Another application of this is device array support, since that PR
essentially constructs an `ArrowArrayView` and recursively copies the
buffer view from the array view to a standalone buffer using some
device-specific logic.
---
r/R/array.R | 25 +++-----
r/src/array.c | 33 ++++++++---
r/src/array.h | 2 +-
src/nanoarrow/array.c | 127 +++++++++++++++++++++++++++++++++++++---
src/nanoarrow/array_inline.h | 10 +++-
src/nanoarrow/array_test.cc | 64 ++++++++++++++++++++
src/nanoarrow/nanoarrow.h | 7 ++-
src/nanoarrow/nanoarrow_types.h | 3 +
8 files changed, 235 insertions(+), 36 deletions(-)
diff --git a/r/R/array.R b/r/R/array.R
index 2e36c57..e9e5f6f 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -180,34 +180,24 @@ nanoarrow_array_proxy <- function(array, schema = NULL, recursive = FALSE) {
array_view <- .Call(nanoarrow_c_array_view, array, schema)
result <- .Call(nanoarrow_c_array_proxy, array, array_view, recursive)
- # Pass on some information from the schema if we have it
- if (!is.null(result$dictionary)) {
- nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
- }
-
names(result$children) <- names(schema$children)
if (!recursive) {
+ # Pass on some information from the schema if we have it
result$children <- Map(
nanoarrow_array_set_schema,
result$children,
schema$children
)
+
+ if (!is.null(result$dictionary)) {
+ nanoarrow_array_set_schema(result$dictionary, schema$dictionary)
+ }
}
} else {
result <- .Call(nanoarrow_c_array_proxy, array, NULL, recursive)
}
- # Recursive-ness of the dictionary is handled here because it's not
- # part of the array view
- if (recursive && !is.null(result$dictionary)) {
- result$dictionary <- nanoarrow_array_proxy(
- result$dictionary,
- schema = schema$dictionary,
- recursive = TRUE
- )
- }
-
result
}
@@ -317,9 +307,12 @@ nanoarrow_array_modify <- function(array, new_values, validate = TRUE) {
dictionary = {
if (!is.null(value)) {
value <- as_nanoarrow_array(value)
+ value_copy <- array_shallow_copy(value, validate = validate)
+ } else {
+ value_copy <- NULL
}
- .Call(nanoarrow_c_array_set_dictionary, array_copy, value)
+ .Call(nanoarrow_c_array_set_dictionary, array_copy, value_copy)
if (!is.null(schema) && !is.null(value)) {
schema <- nanoarrow_schema_modify(
diff --git a/r/src/array.c b/r/src/array.c
index f8e839e..145d196 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -215,7 +215,8 @@ SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP dictionary_xptr) {
}
}
- array_export(dictionary_xptr, array->dictionary);
+ struct ArrowArray* dictionary = array_from_xptr(dictionary_xptr);
+ ArrowArrayMove(dictionary, array->dictionary);
}
return R_NilValue;
@@ -249,11 +250,7 @@ static int move_array_buffers(struct ArrowArray* src, struct ArrowArray* dst,
schema->children[i], error));
}
- // ArrowArrayInit() never initializes a dictionary, so do it here
if (src->dictionary != NULL) {
- NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(dst));
- NANOARROW_RETURN_NOT_OK(
- ArrowArrayInitFromSchema(dst->dictionary, schema->dictionary, error));
NANOARROW_RETURN_NOT_OK(
move_array_buffers(src->dictionary, dst->dictionary, schema->dictionary, error));
}
@@ -365,6 +362,15 @@ static SEXP borrow_array_view_child(struct ArrowArrayView* array_view, int64_t i
}
}
+static SEXP borrow_array_view_dictionary(struct ArrowArrayView* array_view,
+ SEXP shelter) {
+ if (array_view != NULL) {
+ return R_MakeExternalPtr(array_view->dictionary, R_NilValue, shelter);
+ } else {
+ return R_NilValue;
+ }
+}
+
static SEXP borrow_unknown_buffer(struct ArrowArray* array, int64_t i, SEXP shelter) {
return buffer_borrowed_xptr(array->buffers[i], 0, shelter);
}
@@ -442,10 +448,21 @@ SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr, SEXP recursi
UNPROTECT(1);
}
- // The recursive-ness of the dictionary is handled in R because this is not part
- // of the struct ArrowArrayView.
if (array->dictionary != NULL) {
- SET_VECTOR_ELT(array_proxy, 5, borrow_array_xptr(array->dictionary, array_xptr));
+ SEXP dictionary_xptr = PROTECT(borrow_array_xptr(array->dictionary, array_xptr));
+
+ if (recursive) {
+ SEXP dictionary_view_xptr =
+ PROTECT(borrow_array_view_dictionary(array_view, array_view_xptr));
+ SEXP dictionary_proxy = PROTECT(
+ nanoarrow_c_array_proxy(dictionary_xptr, dictionary_view_xptr, recursive_sexp));
+ SET_VECTOR_ELT(array_proxy, 5, dictionary_proxy);
+ UNPROTECT(2);
+ } else {
+ SET_VECTOR_ELT(array_proxy, 5, dictionary_xptr);
+ }
+
+ UNPROTECT(1);
}
UNPROTECT(1);
diff --git a/r/src/array.h b/r/src/array.h
index 18cfa4b..5d0b9ac 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -145,7 +145,7 @@ static inline void array_export(SEXP array_xptr, struct ArrowArray* array_copy)
result = ArrowArrayAllocateChildren(array_copy, array->n_children);
if (result != NANOARROW_OK) {
array_copy->release(array_copy);
- Rf_error("ArrowArraySetBuffer() failed");
+ Rf_error("ArrowArrayAllocateChildren() failed");
}
for (int64_t i = 0; i < array->n_children; i++) {
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index b30f3be..f1524cf 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -168,21 +168,40 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
struct ArrowArrayView* array_view,
struct ArrowError* error) {
- ArrowArrayInitFromType(array, array_view->storage_type);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ ArrowArrayInitFromType(array, array_view->storage_type), error);
+ int result;
+
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
+ private_data->layout = array_view->layout;
- int result = ArrowArrayAllocateChildren(array, array_view->n_children);
- if (result != NANOARROW_OK) {
- array->release(array);
- return result;
+ if (array_view->n_children > 0) {
+ result = ArrowArrayAllocateChildren(array, array_view->n_children);
+ if (result != NANOARROW_OK) {
+ array->release(array);
+ return result;
+ }
+
+ for (int64_t i = 0; i < array_view->n_children; i++) {
+ result =
+ ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
+ if (result != NANOARROW_OK) {
+ array->release(array);
+ return result;
+ }
+ }
}
- private_data->layout = array_view->layout;
+ if (array_view->dictionary != NULL) {
+ result = ArrowArrayAllocateDictionary(array);
+ if (result != NANOARROW_OK) {
+ array->release(array);
+ return result;
+ }
- for (int64_t i = 0; i < array_view->n_children; i++) {
- int result =
- ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
+ result =
+ ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error);
if (result != NANOARROW_OK) {
array->release(array);
return result;
@@ -321,6 +340,20 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_v
}
}
+ if (array->dictionary != NULL) {
+ result = ArrowArrayViewAllocateDictionary(array_view);
+ if (result != NANOARROW_OK) {
+ ArrowArrayViewReset(array_view);
+ return result;
+ }
+
+ result = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary);
+ if (result != NANOARROW_OK) {
+ ArrowArrayViewReset(array_view);
+ return result;
+ }
+ }
+
return NANOARROW_OK;
}
@@ -393,6 +426,10 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) {
NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
}
+ if (array->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
+ }
+
return NANOARROW_OK;
}
@@ -407,6 +444,10 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
for (int64_t i = 0; i < array->n_children; i++) {
ArrowArrayFlushInternalPointers(array->children[i]);
}
+
+ if (array->dictionary != NULL) {
+ ArrowArrayFlushInternalPointers(array->dictionary);
+ }
}
ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
@@ -478,6 +519,21 @@ ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
return NANOARROW_OK;
}
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) {
+ if (array_view->dictionary != NULL) {
+ return EINVAL;
+ }
+
+ array_view->dictionary =
+ (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
+ if (array_view->dictionary == NULL) {
+ return ENOMEM;
+ }
+
+ ArrowArrayViewInitFromType(array_view->dictionary, NANOARROW_TYPE_UNINITIALIZED);
+ return NANOARROW_OK;
+}
+
ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
struct ArrowSchema* schema,
struct ArrowError* error) {
@@ -506,6 +562,21 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
}
}
+ if (schema->dictionary != NULL) {
+ result = ArrowArrayViewAllocateDictionary(array_view);
+ if (result != NANOARROW_OK) {
+ ArrowArrayViewReset(array_view);
+ return result;
+ }
+
+ result =
+ ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error);
+ if (result != NANOARROW_OK) {
+ ArrowArrayViewReset(array_view);
+ return result;
+ }
+ }
+
if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
@@ -537,6 +608,11 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
ArrowFree(array_view->children);
}
+ if (array_view->dictionary != NULL) {
+ ArrowArrayViewReset(array_view->dictionary);
+ ArrowFree(array_view->dictionary);
+ }
+
if (array_view->union_type_id_map != NULL) {
ArrowFree(array_view->union_type_id_map);
}
@@ -652,6 +728,22 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
array->children[i], error));
}
+ // Check dictionary
+ if (array->dictionary == NULL && array_view->dictionary != NULL) {
+ ArrowErrorSet(error, "Expected dictionary but found NULL");
+ return EINVAL;
+ }
+
+ if (array->dictionary != NULL && array_view->dictionary == NULL) {
+ ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member");
+ return EINVAL;
+ }
+
+ if (array->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(
+ ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error));
+ }
+
return NANOARROW_OK;
}
@@ -767,6 +859,11 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
ArrowArrayViewValidateMinimal(array_view->children[i], error));
}
+ // Recurse for dictionary
+ if (array_view->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error));
+ }
+
return NANOARROW_OK;
}
@@ -903,6 +1000,11 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
ArrowArrayViewValidateDefault(array_view->children[i], error));
}
+ // Recurse for dictionary
+ if (array_view->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error));
+ }
+
return NANOARROW_OK;
}
@@ -1047,10 +1149,17 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
}
}
+ // Recurse for children
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
}
+ // Dictionary valiation not implemented
+ if (array_view->dictionary != NULL) {
+ ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented");
+ return ENOTSUP;
+ }
+
return NANOARROW_OK;
}
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index dda4634..634d5d1 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -150,11 +150,15 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array)
}
}
- // Start building any child arrays
+ // Start building any child arrays or dictionaries
for (int64_t i = 0; i < array->n_children; i++) {
NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->children[i]));
}
+ if (array->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayStartAppending(array->dictionary));
+ }
+
return NANOARROW_OK;
}
@@ -168,6 +172,10 @@ static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) {
NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->children[i]));
}
+ if (array->dictionary != NULL) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayShrinkToFit(array->dictionary));
+ }
+
return NANOARROW_OK;
}
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 3170efa..d9e0de0 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -1950,6 +1950,70 @@ TEST(ArrayTest, ArrayViewTestFixedSizeListArray) {
array.release(&array);
}
+TEST(ArrayTest, ArrayViewTestDictionary) {
+ struct ArrowSchema schema;
+ struct ArrowArrayView array_view;
+
+ ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_INT32), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaAllocateDictionary(&schema), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaInitFromType(schema.dictionary, NANOARROW_TYPE_STRING),
+ NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(array_view.storage_type, NANOARROW_TYPE_INT32);
+ ASSERT_NE(array_view.dictionary, nullptr);
+ EXPECT_EQ(array_view.dictionary->storage_type, NANOARROW_TYPE_STRING);
+
+ // Build a dictionary array
+ struct ArrowArray array;
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(array.dictionary, ArrowCharView("abc")), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(array.dictionary, ArrowCharView("def")), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(&array, 0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(&array, 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+ EXPECT_EQ(array_view.buffer_views[1].size_bytes, 2 * sizeof(int32_t));
+ EXPECT_EQ(array_view.dictionary->buffer_views[2].size_bytes, 6);
+
+ // Full validation not yet supported for dictionary
+ EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, nullptr),
+ ENOTSUP);
+
+ EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewGetIntUnsafe(&array_view, 1), 1);
+
+ struct ArrowStringView item;
+ item = ArrowArrayViewGetStringUnsafe(array_view.dictionary, 0);
+ EXPECT_EQ(std::string(item.data, item.size_bytes), "abc");
+ item = ArrowArrayViewGetStringUnsafe(array_view.dictionary, 1);
+ EXPECT_EQ(std::string(item.data, item.size_bytes), "def");
+
+ array.release(&array);
+
+ // Setting a non-dictionary array should error
+ struct ArrowError error;
+ ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_INT32), NANOARROW_OK);
+ EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+ EXPECT_STREQ(error.message, "Expected dictionary but found NULL");
+
+ array.release(&array);
+
+ // Setting a dictionary array to a non-dictionary array view should error
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+ ArrowArrayViewReset(&array_view);
+ ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_INT32);
+ EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL);
+ EXPECT_STREQ(error.message, "Expected NULL dictionary but found dictionary member");
+
+ schema.release(&schema);
+ array.release(&array);
+ ArrowArrayViewReset(&array_view);
+}
+
TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
struct ArrowArrayView array_view;
struct ArrowArray array;
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 41570df..372b9dd 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -112,6 +112,8 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromSchema)
#define ArrowArrayViewAllocateChildren \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateChildren)
+#define ArrowArrayViewAllocateDictionary \
+ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewAllocateDictionary)
#define ArrowArrayViewSetLength \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetLength)
#define ArrowArrayViewSetArray \
@@ -939,12 +941,15 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
struct ArrowSchema* schema,
struct ArrowError* error);
-/// \brief Allocate the schema_view->children array
+/// \brief Allocate the array_view->children array
///
/// Includes the memory for each child struct ArrowArrayView
ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
int64_t n_children);
+/// \brief Allocate array_view->dictionary
+ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view);
+
/// \brief Set data-independent buffer sizes from length
void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length);
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 79954db..45ee3c6 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -608,6 +608,9 @@ struct ArrowArrayView {
/// \brief Pointers to views of this array's children
struct ArrowArrayView** children;
+ /// \brief Pointer to a view of this array's dictionary
+ struct ArrowArrayView* dictionary;
+
/// \brief Union type id to child index mapping
///
/// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer