You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/12/08 20:00:52 UTC

[arrow-nanoarrow] branch main updated: [C] Add read support for unions in the `ArrowArrayView` (#83)

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 695f08d  [C] Add read support for unions in the `ArrowArrayView` (#83)
695f08d is described below

commit 695f08d7a07a64c0e13eb3444d61b2a6a8d8e5df
Author: Dewey Dunnington <de...@fishandwhistle.net>
AuthorDate: Thu Dec 8 16:00:46 2022 -0400

    [C] Add read support for unions in the `ArrowArrayView` (#83)
    
    * add type id <-> child index map in the array view for unions
    
    * fix mapping and test it
    
    * add child offset calculator
---
 src/nanoarrow/array.c           |  20 +++++
 src/nanoarrow/array_inline.h    |  37 ++++++++-
 src/nanoarrow/array_test.cc     | 161 ++++++++++++++++++++++++++++++++++++++++
 src/nanoarrow/nanoarrow.h       |  10 +++
 src/nanoarrow/nanoarrow_types.h |   7 ++
 5 files changed, 233 insertions(+), 2 deletions(-)

diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c
index 8f06a39..6443ae0 100644
--- a/src/nanoarrow/array.c
+++ b/src/nanoarrow/array.c
@@ -533,6 +533,22 @@ ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
     }
   }
 
+  if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
+      array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
+    array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
+    if (array_view->union_type_id_map == NULL) {
+      return ENOMEM;
+    }
+
+    memset(array_view->union_type_id_map, -1, 256);
+    int8_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids,
+                                                array_view->union_type_id_map + 128);
+    for (int8_t child_index = 0; child_index < n_type_ids; child_index++) {
+      int8_t type_id = array_view->union_type_id_map[128 + child_index];
+      array_view->union_type_id_map[type_id] = child_index;
+    }
+  }
+
   return NANOARROW_OK;
 }
 
@@ -548,6 +564,10 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
     ArrowFree(array_view->children);
   }
 
+  if (array_view->union_type_id_map != NULL) {
+    ArrowFree(array_view->union_type_id_map);
+  }
+
   ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED);
 }
 
diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h
index f933d95..d2517cc 100644
--- a/src/nanoarrow/array_inline.h
+++ b/src/nanoarrow/array_inline.h
@@ -619,13 +619,46 @@ static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int
       return 0x01;
     case NANOARROW_TYPE_DENSE_UNION:
     case NANOARROW_TYPE_SPARSE_UNION:
-      // Not supported yet
-      return -1;
+      // Unions are "never null" in Arrow land
+      return 0x00;
     default:
       return validity_buffer != NULL && !ArrowBitGet(validity_buffer, i);
   }
 }
 
+static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view,
+                                               int64_t i) {
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_DENSE_UNION:
+    case NANOARROW_TYPE_SPARSE_UNION:
+      return array_view->buffer_views[0].data.as_int8[i];
+    default:
+      return -1;
+  }
+}
+
+static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view,
+                                                   int64_t i) {
+  int8_t type_id = ArrowArrayViewUnionTypeId(array_view, i);
+  if (array_view->union_type_id_map == NULL) {
+    return type_id;
+  } else {
+    return array_view->union_type_id_map[type_id];
+  }
+}
+
+static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view,
+                                                     int64_t i) {
+  switch (array_view->storage_type) {
+    case NANOARROW_TYPE_DENSE_UNION:
+      return array_view->buffer_views[1].data.as_int32[i];
+    case NANOARROW_TYPE_SPARSE_UNION:
+      return i;
+    default:
+      return -1;
+  }
+}
+
 static inline int64_t ArrowArrayViewGetIntUnsafe(struct ArrowArrayView* array_view,
                                                  int64_t i) {
   struct ArrowBufferView* data_view = &array_view->buffer_views[1];
diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc
index 8959657..a028b10 100644
--- a/src/nanoarrow/array_test.cc
+++ b/src/nanoarrow/array_test.cc
@@ -1594,6 +1594,167 @@ TEST(ArrayTest, ArrayViewTestFixedSizeListArray) {
   array.release(&array);
 }
 
+TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
+  struct ArrowArrayView array_view;
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+
+  // Build a simple union with one int and one string
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_DENSE_UNION, 2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  // The ArrayView for a union could in theory be created without a schema,
+  // in which case the type_ids are assumed to equal child indices
+  ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_DENSE_UNION);
+  ASSERT_EQ(ArrowArrayViewAllocateChildren(&array_view, 2), NANOARROW_OK);
+  ArrowArrayViewInitFromType(array_view.children[0], NANOARROW_TYPE_INT32);
+  ArrowArrayViewInitFromType(array_view.children[1], NANOARROW_TYPE_STRING);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+
+  ArrowArrayViewReset(&array_view);
+
+  // The test schema explicitly sets the type_ids 0,1 and this should work too
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+
+  ArrowArrayViewReset(&array_view);
+
+  // Reversing the type ids should result in the same type ids but
+  // reversed child indices
+  ASSERT_EQ(ArrowSchemaSetFormat(&schema, "+ud:1,0"), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionTypeId(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 0);
+
+  ArrowArrayViewReset(&array_view);
+
+  // Check the raw mapping in the array view for numbers that are easier to check
+  ASSERT_EQ(ArrowSchemaSetFormat(&schema, "+ud:6,2"), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  EXPECT_EQ(array_view.union_type_id_map[6], 0);
+  EXPECT_EQ(array_view.union_type_id_map[2], 1);
+  EXPECT_EQ(array_view.union_type_id_map[128 + 0], 6);
+  EXPECT_EQ(array_view.union_type_id_map[128 + 1], 2);
+
+  ArrowArrayViewReset(&array_view);
+  schema.release(&schema);
+  array.release(&array);
+}
+
+TEST(ArrayTest, ArrayViewTestDenseUnionGet) {
+  struct ArrowArrayView array_view;
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+
+  // Build a simple union with one int and one string and one null int
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_DENSE_UNION, 2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  // Initialize the array view
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  // Check the values that will be used to index into children
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 2), 0);
+
+  // Check the values that will be used to index into the child arrays
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 1), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 2), 1);
+
+  // Union elements are "never null" (even if the corresponding child element is)
+  EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), NANOARROW_OK);
+
+  ArrowArrayViewReset(&array_view);
+  schema.release(&schema);
+  array.release(&array);
+}
+
+TEST(ArrayTest, ArrayViewTestSparseUnionGet) {
+  struct ArrowArrayView array_view;
+  struct ArrowArray array;
+  struct ArrowSchema schema;
+
+  // Build a simple union with one int and one string and one null int
+  ArrowSchemaInit(&schema);
+  ASSERT_EQ(ArrowSchemaSetTypeUnion(&schema, NANOARROW_TYPE_SPARSE_UNION, 2),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK);
+  ASSERT_EQ(ArrowSchemaSetType(schema.children[1], NANOARROW_TYPE_STRING), NANOARROW_OK);
+
+  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendString(array.children[1], ArrowCharView("one twenty four")),
+            NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, nullptr), NANOARROW_OK);
+
+  // Initialize the array view
+  ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, nullptr), NANOARROW_OK);
+  ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, nullptr), NANOARROW_OK);
+
+  // Check the values that will be used to index into children
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildIndex(&array_view, 2), 0);
+
+  // Check the values that will be used to index into the child arrays
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 0), 0);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 1), 1);
+  EXPECT_EQ(ArrowArrayViewUnionChildOffset(&array_view, 2), 2);
+
+  // Union elements are "never null" (even if the corresponding child element is)
+  EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), NANOARROW_OK);
+
+  ArrowArrayViewReset(&array_view);
+  schema.release(&schema);
+  array.release(&array);
+}
+
 template <typename TypeClass>
 void TestGetFromNumericArrayView() {
   struct ArrowArray array;
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index e19fb8f..daa9c32 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -905,6 +905,16 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view);
 /// \brief Check for a null element in an ArrowArrayView
 static inline int8_t ArrowArrayViewIsNull(struct ArrowArrayView* array_view, int64_t i);
 
+/// \brief Get the type id of a union array element
+static inline int8_t ArrowArrayViewUnionTypeId(struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get the child index of a union array element
+static inline int8_t ArrowArrayViewUnionChildIndex(struct ArrowArrayView* array_view, int64_t i);
+
+/// \brief Get the index to use into the relevant union child array
+static inline int64_t ArrowArrayViewUnionChildOffset(struct ArrowArrayView* array_view,
+                                                     int64_t i);
+
 /// \brief Get an element in an ArrowArrayView as an integer
 ///
 /// This function does not check for null values, that values are actually integers, or
diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h
index 56b0369..291cfba 100644
--- a/src/nanoarrow/nanoarrow_types.h
+++ b/src/nanoarrow/nanoarrow_types.h
@@ -498,6 +498,13 @@ struct ArrowArrayView {
 
   /// \brief Pointers to views of this array's children
   struct ArrowArrayView** children;
+
+  /// \brief Union type id to child index mapping
+  ///
+  /// If storage_type is a union type, a 256-byte ArrowMalloc()ed buffer
+  /// such that child_index == union_type_id_map[type_id] and
+  /// type_id == union_type_id_map[128 + child_index]
+  int8_t* union_type_id_map;
 };
 
 // Used as the private data member for ArrowArrays allocated here and accessed