You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/12/08 20:00:52 UTC
[arrow-nanoarrow] branch main updated: [C] Add read support for unions in the `ArrowArrayView` (#83)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 695f08d [C] Add read support for unions in the `ArrowArrayView` (#83)
695f08d is described below
commit 695f08d7a07a64c0e13eb3444d61b2a6a8d8e5df
Author: Dewey Dunnington <de...@fishandwhistle.net>
AuthorDate: Thu Dec 8 16:00:46 2022 -0400
[C] Add read support for unions in the `ArrowArrayView` (#83)
* add type id <-> child index map in the array view for unions
* fix mapping and test it
* add child offset calculator
---
src/nanoarrow/array.c | 20 +++++
src/nanoarrow/array_inline.h | 37 ++++++++-
src/nanoarrow/array_test.cc | 161 ++++++++++++++++++++++++++++++++++++++++
src/nanoarrow/nanoarrow.h | 10 +++
src/nanoarrow/nanoarrow_types.h | 7 ++
5 files changed, 233 insertions(+), 2 deletions(-)
diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 8f06a39..6443ae0 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -533,6 +533,22 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
}
}
+ if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
+ array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+ array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
+ if (array_view->union_type_id_map == NULL) {
+ return ENOMEM;
+ }
+
+ memset(array_view->union_type_id_map, -1, 256);
+ int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids,
+ array_view->union_type_id_map + 128);
+ for (int8_t child_index = 0; child_index < n_type_ids; child_index++) {
+ int8_t type_id = array_view->union_type_id_map[128 + child_index];
+ array_view->union_type_id_map[type_id] = child_index;
+ }
+ }
+
return NANOARROW_OK;
}
@@ -548,6 +564,10 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
ArrowFree(array_view->children);
}
+ if (array_view->union_type_id_map != NULL) {
+ ArrowFree(array_view->union_type_id_map);
+ }
+
ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED);
}
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index f933d95..d2517cc 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -619,13 +619,46 @@ static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int
return 0x01;
case NANOARROW_TYPE_DENSE_UNION:
case NANOARROW_TYPE_SPARSE_UNION:
- // Not supported yet
- return -1;
+ // Unions are "never null" in Arrow land
+ return 0x00;
default:
return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i);
}
}
+// Return the type id recorded for element i of a union array view.
+//
+// Reads the int8 value at index i of the first buffer view. Yields -1 when
+// the view's storage type is neither a dense nor a sparse union.
+static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view,
+                                               int64_t i) {
+  const int is_union = array_view->storage_type == NANOARROW_TYPE_DENSE_UNION ||
+                       array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION;
+  if (!is_union) {
+    return -1;
+  }
+
+  return array_view->buffer_views[0].data.as_int8[i];
+}
+
+// Map the type id of union element i to the index of the child that holds it.
+//
+// When the view has no type id map, the type id is returned unchanged.
+// Otherwise the type id is looked up in union_type_id_map. A negative type id
+// cannot be used as an index (it would read before the start of the map), so
+// it yields -1 instead.
+static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view,
+                                                   int64_t i) {
+  int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i);
+  if (array_view->union_type_id_map == NULL) {
+    return type_id;
+  }
+
+  // Guard against an out-of-bounds read for invalid (negative) type ids
+  if (type_id < 0) {
+    return -1;
+  }
+
+  return array_view->union_type_id_map[type_id];
+}
+
+// Return the position of union element i within its child array.
+//
+// Dense unions read the int32 value at index i of the second buffer view.
+// Sparse unions use i itself. Any other storage type yields -1.
+static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view,
+                                                     int64_t i) {
+  if (array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+    return array_view->buffer_views[1].data.as_int32[i];
+  }
+
+  if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION) {
+    return i;
+  }
+
+  return -1;
+}
+
static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view,
int64_t i) {
struct ArrowBufferView* data_view = &array_view->buffer_views[1];
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 8959657..a028b10 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -1594,6 +1594,167 @@ TEST(ArrayTest, ArrayViewTestFixedSizeListArray) {
array.release(&array);
}
+TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
+ struct ArrowArrayView array_view;
+ struct ArrowArray array;
+ struct ArrowSchema schema;
+
+ // Build a simple union with one int and one string
+ ArrowSchemaInit(&schema);
+ ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_DENSE_UNION, 2),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+ ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+ ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+ NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+ // The ArrayView for a union could in theory be created without a schema,
+ // in which case the type_ids are assumed to equal child indices
+ ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_DENSE_UNION);
+ ASSERT_EQ(ArrowArrayViewAllocateChildren(&array_view, 2), NANOARROW_OK);
+ ArrowArrayViewInitFromType(array_view.children[0], NANOARROW_TYPE_INT32);
+ ArrowArrayViewInitFromType(array_view.children[1], NANOARROW_TYPE_STRING);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+
+ ArrowArrayViewReset(&array_view);
+
+ // The test schema explicitly sets the type_ids 0,1 and this should work too
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+
+ ArrowArrayViewReset(&array_view);
+
+ // Reversing the type ids should result in the same type ids but
+ // reversed child indices
+ ASSERT_EQ(ArrowSchemaSetFormat(&schema, "+ud:1,0"), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+ EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 1);
+ EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 0);
+
+ ArrowArrayViewReset(&array_view);
+
+ // Check the raw mapping in the array view for numbers that are easier to check
+ ASSERT_EQ(ArrowSchemaSetFormat(&schema, "+ud:6,2"), NANOARROW_OK);
+ ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+ EXPECT_EQ(array_view.union_type_id_map[6], 0);
+ EXPECT_EQ(array_view.union_type_id_map[2], 1);
+ EXPECT_EQ(array_view.union_type_id_map[128 + 0], 6);
+ EXPECT_EQ(array_view.union_type_id_map[128 + 1], 2);
+
+ ArrowArrayViewReset(&array_view);
+ schema.release(&schema);
+ array.release(&array);
+}
+
+// Checks child index lookup, child offset lookup, and null reporting for a
+// three-element dense union read through an ArrowArrayView.
+TEST(ArrayTest, ArrayViewTestDenseUnionGet) {
+  struct ArrowArrayView array_view;
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+
+  // Build a simple union with one int and one string and one null int
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_DENSE_UNION, 2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  // Initialize the array view
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  // Check the values that will be used to index into children
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 2), 0);
+
+  // Check the values that will be used to index into the child arrays
+  // (dense offsets count elements per child, so the null is element 1 of child 0)
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 1), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 2), 1);
+
+  // Union elements are "never null" (even if the corresponding child element is).
+  // ArrowArrayViewIsNull() returns a boolean-like int8_t rather than an error
+  // code, so the expected "not null" value is spelled 0, not NANOARROW_OK.
+  EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 0);
+
+  ArrowArrayViewReset(&array_view);
+  schema.release(&schema);
+  array.release(&array);
+}
+
+// Checks child index lookup, child offset lookup, and null reporting for a
+// three-element sparse union read through an ArrowArrayView.
+TEST(ArrayTest, ArrayViewTestSparseUnionGet) {
+  struct ArrowArrayView array_view;
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+
+  // Build a simple union with one int and one string and one null int
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_SPARSE_UNION, 2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  // Initialize the array view
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  // Check the values that will be used to index into children
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 2), 0);
+
+  // Check the values that will be used to index into the child arrays
+  // (for a sparse union the child offset is the element index itself)
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 2), 2);
+
+  // Union elements are "never null" (even if the corresponding child element is).
+  // ArrowArrayViewIsNull() returns a boolean-like int8_t rather than an error
+  // code, so the expected "not null" value is spelled 0, not NANOARROW_OK.
+  EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 0);
+
+  ArrowArrayViewReset(&array_view);
+  schema.release(&schema);
+  array.release(&array);
+}
+
template <typename TypeClass>
void TestGetFromNumericArrayView() {
struct ArrowArray array;
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index e19fb8f..daa9c32 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -905,6 +905,16 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view);
/// \brief Check for a null element in an ArrowArrayView
static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i);
+/// \brief Get the type id of a union array element (-1 if array_view is not a union)
+static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get the child index of a union array element (its type id if no map is set)
+static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get the index of element i in its union child array (-1 if not a union)
+static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view,
+ int64_t i);
+
/// \brief Get an element in an ArrowArrayView as an integer
///
/// This function does not check for null values, that values are actually integers, or
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 56b0369..291cfba 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -498,6 +498,13 @@ struct ArrowArrayView {
/// \brief Pointers to views of this array's children
struct ArrowArrayView** children;
+
+ /// \brief Union type id to child index mapping
+ ///
+ /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer
+ /// such that child_index == union_type_id_map[type_id] and
+ /// type_id == union_type_id_map[128 + child_index]
+ int8_t* union_type_id_map;
};
// Used as the private data member for ArrowArrays allocated here and accessed