You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by gi...@apache.org on 2022/12/07 20:44:37 UTC

[arrow-nanoarrow] branch main updated: Update dist/ for commit ec0824dcf064448648cc6eb5ef7873962fa23339

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 9af8efd  Update dist/ for commit ec0824dcf064448648cc6eb5ef7873962fa23339
9af8efd is described below

commit 9af8efdea104a966c9e059cbe0101f4fcab889d0
Author: GitHub Actions <ac...@github.com>
AuthorDate: Wed Dec 7 20:44:32 2022 +0000

    Update dist/ for commit ec0824dcf064448648cc6eb5ef7873962fa23339
---
 dist/nanoarrow.c |  52 ++++++++----
 dist/nanoarrow.h | 242 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 250 insertions(+), 44 deletions(-)

diff --git a/dist/nanoarrow.c b/dist/nanoarrow.c
index 7965404..d2b3c59 100644
--- a/dist/nanoarrow.c
+++ b/dist/nanoarrow.c
@@ -956,8 +956,17 @@ static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view,
           }
 
           if (format[3] == ':') {
-            schema_view->union_type_ids.data = format + 4;
-            schema_view->union_type_ids.n_bytes = strlen(format + 4);
+            schema_view->union_type_ids = format + 4;
+            int64_t n_type_ids =
+                _ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL);
+            if (n_type_ids != schema_view->schema->n_children) {
+              ArrowErrorSet(
+                  error,
+                  "Expected union type_ids parameter to be a comma-separated list of %ld "
+                  "values between 0 and 127 but found '%s'",
+                  (long)schema_view->schema->n_children, schema_view->union_type_ids);
+              return EINVAL;
+            }
             *format_end_out = format + strlen(format);
             return NANOARROW_OK;
           } else {
@@ -1321,9 +1330,12 @@ ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view,
       ArrowSchemaViewParse(schema_view, format, &format_end_out, error);
 
   if (result != NANOARROW_OK) {
-    char child_error[1024];
-    memcpy(child_error, ArrowErrorMessage(error), 1024);
-    ArrowErrorSet(error, "Error parsing schema->format: %s", child_error);
+    if (error != NULL) {
+      char child_error[1024];
+      memcpy(child_error, ArrowErrorMessage(error), 1024);
+      ArrowErrorSet(error, "Error parsing schema->format: %s", child_error);
+    }
+    
     return result;
   }
 
@@ -1389,9 +1401,7 @@ static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_vi
       return snprintf(out, n, "%s(%ld)", type_string, (long)schema_view->fixed_size);
     case NANOARROW_TYPE_SPARSE_UNION:
     case NANOARROW_TYPE_DENSE_UNION:
-      return snprintf(out, n, "%s([%.*s])", type_string,
-                      (int)schema_view->union_type_ids.n_bytes,
-                      schema_view->union_type_ids.data);
+      return snprintf(out, n, "%s([%s])", type_string, schema_view->union_type_ids);
     default:
       return snprintf(out, n, "%s", type_string);
   }
@@ -1882,12 +1892,15 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
   }
 
   ArrowLayoutInit(&private_data->layout, storage_type);
+  // We can only know this not to be true when initializing based on a schema
+  // so assume this to be true.
+  private_data->union_type_id_is_child_index = 1;
   return NANOARROW_OK;
 }
 
-static ArrowErrorCode ArrowArrayInitFromTypeFromArrayView(
-    struct ArrowArray* array, struct ArrowArrayView* array_view,
-    struct ArrowError* error) {
+static ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
+                                                  struct ArrowArrayView* array_view,
+                                                  struct ArrowError* error) {
   ArrowArrayInitFromType(array, array_view->storage_type);
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
@@ -1901,8 +1914,8 @@ static ArrowErrorCode ArrowArrayInitFromTypeFromArrayView(
   private_data->layout = array_view->layout;
 
   for (int64_t i = 0; i < array_view->n_children; i++) {
-    int result = ArrowArrayInitFromTypeFromArrayView(array->children[i],
-                                                     array_view->children[i], error);
+    int result =
+        ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
     if (result != NANOARROW_OK) {
       array->release(array);
       return result;
@@ -1917,7 +1930,18 @@ ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array,
                                         struct ArrowError* error) {
   struct ArrowArrayView array_view;
   NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error));
-  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromTypeFromArrayView(array, &array_view, error));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error));
+  if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION ||
+      array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) {
+    struct ArrowArrayPrivateData* private_data =
+        (struct ArrowArrayPrivateData*)array->private_data;
+    // We can still build arrays if this isn't true; however, the append
+    // functions won't work. Instead, we store this value and error only
+    // when StartAppending is called.
+    private_data->union_type_id_is_child_index =
+        _ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children);
+  }
+
   ArrowArrayViewReset(&array_view);
   return NANOARROW_OK;
 }
diff --git a/dist/nanoarrow.h b/dist/nanoarrow.h
index 960f143..ff3ee7c 100644
--- a/dist/nanoarrow.h
+++ b/dist/nanoarrow.h
@@ -29,7 +29,7 @@
 
 // #define NANOARROW_NAMESPACE YourNamespaceHere
 
-#define NANOARROW_BUILD_ID "ghadb11c168ecab99ecac9028afbf71427c7107bb12"
+#define NANOARROW_BUILD_ID "ghaec0824dcf064448648cc6eb5ef7873962fa23339"
 
 #endif
 // Licensed to the Apache Software Foundation (ASF) under one
@@ -554,6 +554,11 @@ struct ArrowArrayPrivateData {
 
   // The buffer arrangement for the storage type
   struct ArrowLayout layout;
+
+  // Flag that is 1 when every union type id equals its child index and 0 otherwise.
+  // In the future this could be replaced with a type id<->child mapping
+  // to support constructing unions in append mode where type_id != child_index
+  int8_t union_type_id_is_child_index;
 };
 
 #ifdef __cplusplus
@@ -1065,7 +1070,7 @@ struct ArrowSchemaView {
   /// This value is set when parsing a union type and represents
   /// type ids parameter. The ArrowStringView points to
   /// data within the schema and the value is undefined for other types.
-  struct ArrowStringView union_type_ids;
+  const char* union_type_ids;
 };
 
 /// \brief Initialize an ArrowSchemaView
@@ -1348,6 +1353,9 @@ ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
 /// \brief Append a null value to an array
 static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n);
 
+/// \brief Append an empty, non-null value to an array
+static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n);
+
 /// \brief Append a signed integer value to an array
 ///
 /// Returns NANOARROW_OK if value can be exactly represented by
@@ -1398,6 +1406,15 @@ static inline ArrowErrorCode ArrowArrayAppendString(struct ArrowArray* array,
 /// length of the child array(s) did not match the expected length.
 static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array);
 
+/// \brief Finish a union array element
+///
+/// Appends an element to the union type ids buffer and increments array->length.
+/// For sparse unions, up to one element is added to non type-id children. Returns
+/// EINVAL if the underlying storage type is not a union, if type_id is not valid,
+/// or if child sizes after appending are inconsistent.
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array,
+                                                          int8_t type_id);
+
 /// \brief Shrink buffer capacity to the size required
 ///
 /// Also applies shrinking to any child arrays. array must have been allocated using
@@ -2081,10 +2098,87 @@ static inline struct ArrowBuffer* ArrowArrayBuffer(struct ArrowArray* array, int
   }
 }
 
+// We don't currently support the case of unions where type_id != child_index;
+// however, these functions are used to keep track of where that assumption
+// is made.
+static inline int8_t _ArrowArrayUnionChildIndex(struct ArrowArray* array,
+                                                int8_t type_id) {
+  return type_id;
+}
+
+static inline int8_t _ArrowArrayUnionTypeId(struct ArrowArray* array,
+                                            int8_t child_index) {
+  return child_index;
+}
+
+static inline int8_t _ArrowParseUnionTypeIds(const char* type_ids, int8_t* out) {
+  if (*type_ids == '\0') {
+    return 0;
+  }
+
+  int32_t i = 0;
+  long type_id;
+  char* end_ptr;
+  do {
+    type_id = strtol(type_ids, &end_ptr, 10);
+    if (end_ptr == type_ids || type_id < 0 || type_id > 127) {
+      return -1;
+    }
+
+    if (out != NULL) {
+      out[i] = type_id;
+    }
+
+    i++;
+
+    type_ids = end_ptr;
+    if (*type_ids == '\0') {
+      return i;
+    } else if (*type_ids != ',') {
+      return -1;
+    } else {
+      type_ids++;
+    }
+  } while (1);
+
+  return -1;
+}
+
+static inline int8_t _ArrowUnionTypeIdsWillEqualChildIndices(const char* type_id_str,
+                                                             int64_t n_children) {
+  int8_t type_ids[128];
+  int8_t n_type_ids = _ArrowParseUnionTypeIds(type_id_str, type_ids);
+  if (n_type_ids != n_children) {
+    return 0;
+  }
+
+  for (int8_t i = 0; i < n_type_ids; i++) {
+    if (type_ids[i] != i) {
+      return 0;
+    }
+  }
+
+  return 1;
+}
+
 static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
+  switch (private_data->storage_type) {
+    case NANOARROW_TYPE_UNINITIALIZED:
+      return EINVAL;
+    case NANOARROW_TYPE_SPARSE_UNION:
+    case NANOARROW_TYPE_DENSE_UNION:
+      // Note that this value is also not 1 if the type_ids string was invalid
+      if (private_data->union_type_id_is_child_index != 1) {
+        return EINVAL;
+      } else {
+        break;
+      }
+    default:
+      break;
+  }
   if (private_data->storage_type == NANOARROW_TYPE_UNINITIALIZED) {
     return EINVAL;
   }
@@ -2140,7 +2234,8 @@ static inline ArrowErrorCode _ArrowArrayAppendBits(struct ArrowArray* array,
   return NANOARROW_OK;
 }
 
-static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) {
+static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* array,
+                                                            int64_t n, uint8_t is_valid) {
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
@@ -2148,21 +2243,71 @@ static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int6
     return NANOARROW_OK;
   }
 
-  if (private_data->storage_type == NANOARROW_TYPE_NA) {
-    array->null_count += n;
-    array->length += n;
-    return NANOARROW_OK;
+  // Some type-specific handling
+  switch (private_data->storage_type) {
+    case NANOARROW_TYPE_NA:
+      // (An empty value for a null array *is* a null)
+      array->null_count += n;
+      array->length += n;
+      return NANOARROW_OK;
+
+    case NANOARROW_TYPE_DENSE_UNION: {
+      // Add one null/empty value to the first child and append n references to it
+      int8_t type_id = _ArrowArrayUnionTypeId(array, 0);
+      NANOARROW_RETURN_NOT_OK(
+          _ArrowArrayAppendEmptyInternal(array->children[0], 1, is_valid));
+      NANOARROW_RETURN_NOT_OK(
+          ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n));
+      for (int64_t i = 0; i < n; i++) {
+        NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(
+            ArrowArrayBuffer(array, 1), (int32_t)array->children[0]->length - 1));
+      }
+      // For the purposes of array->null_count, union elements are never considered "null"
+      // even if some children contain nulls.
+      array->length += n;
+      return NANOARROW_OK;
+    }
+
+    case NANOARROW_TYPE_SPARSE_UNION: {
+      // Add n null/empty values to the first child and append n references to it
+      int8_t type_id = _ArrowArrayUnionTypeId(array, 0);
+      NANOARROW_RETURN_NOT_OK(
+          _ArrowArrayAppendEmptyInternal(array->children[0], n, is_valid));
+      for (int64_t i = 1; i < array->n_children; i++) {
+        NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n));
+      }
+
+      NANOARROW_RETURN_NOT_OK(
+          ArrowBufferAppendFill(ArrowArrayBuffer(array, 0), type_id, n));
+      // For the purposes of array->null_count, union elements are never considered "null"
+      // even if some children contain nulls.
+      array->length += n;
+      return NANOARROW_OK;
+    }
+
+    case NANOARROW_TYPE_FIXED_SIZE_LIST:
+      NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(
+          array->children[0], n * private_data->layout.child_size_elements));
+      break;
+    case NANOARROW_TYPE_STRUCT:
+      for (int64_t i = 0; i < array->n_children; i++) {
+        NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], n));
+      }
+      break;
+
+    default:
+      break;
   }
 
-  // Append n 0 bits to the validity bitmap. If we haven't allocated a bitmap yet, do it
-  // now
-  if (private_data->bitmap.buffer.data == NULL) {
+  // Append n is_valid bits to the validity bitmap. If we haven't allocated a bitmap yet
+  // and we need to append nulls, do it now.
+  if (!is_valid && private_data->bitmap.buffer.data == NULL) {
     NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, array->length + n));
     ArrowBitmapAppendUnsafe(&private_data->bitmap, 1, array->length);
-    ArrowBitmapAppendUnsafe(&private_data->bitmap, 0, n);
-  } else {
+    ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n);
+  } else if (private_data->bitmap.buffer.data != NULL) {
     NANOARROW_RETURN_NOT_OK(ArrowBitmapReserve(&private_data->bitmap, n));
-    ArrowBitmapAppendUnsafe(&private_data->bitmap, 0, n);
+    ArrowBitmapAppendUnsafe(&private_data->bitmap, is_valid, n);
   }
 
   // Add appropriate buffer fill
@@ -2200,31 +2345,24 @@ static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int6
 
       case NANOARROW_BUFFER_TYPE_TYPE_ID:
       case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
-        // Not supported
+        // These cases return above
         return EINVAL;
     }
   }
 
-  // For fixed-size list and struct we need to append some nulls to
-  // children for the lengths to line up properly
-  switch (private_data->storage_type) {
-    case NANOARROW_TYPE_FIXED_SIZE_LIST:
-      NANOARROW_RETURN_NOT_OK(ArrowArrayAppendNull(
-          array->children[0], n * private_data->layout.child_size_elements));
-      break;
-    case NANOARROW_TYPE_STRUCT:
-      for (int64_t i = 0; i < array->n_children; i++) {
-        NANOARROW_RETURN_NOT_OK(ArrowArrayAppendNull(array->children[i], n));
-      }
-    default:
-      break;
-  }
-
   array->length += n;
-  array->null_count += n;
+  array->null_count += n * !is_valid;
   return NANOARROW_OK;
 }
 
+static inline ArrowErrorCode ArrowArrayAppendNull(struct ArrowArray* array, int64_t n) {
+  return _ArrowArrayAppendEmptyInternal(array, n, 0);
+}
+
+static inline ArrowErrorCode ArrowArrayAppendEmpty(struct ArrowArray* array, int64_t n) {
+  return _ArrowArrayAppendEmptyInternal(array, n, 1);
+}
+
 static inline ArrowErrorCode ArrowArrayAppendInt(struct ArrowArray* array,
                                                  int64_t value) {
   struct ArrowArrayPrivateData* private_data =
@@ -2473,6 +2611,50 @@ static inline ArrowErrorCode ArrowArrayFinishElement(struct ArrowArray* array) {
   return NANOARROW_OK;
 }
 
+static inline ArrowErrorCode ArrowArrayFinishUnionElement(struct ArrowArray* array,
+                                                          int8_t type_id) {
+  struct ArrowArrayPrivateData* private_data =
+      (struct ArrowArrayPrivateData*)array->private_data;
+
+  int64_t child_index = _ArrowArrayUnionChildIndex(array, type_id);
+  if (child_index < 0 || child_index >= array->n_children) {
+    return EINVAL;
+  }
+
+  switch (private_data->storage_type) {
+    case NANOARROW_TYPE_DENSE_UNION:
+      // Append the target child's last index (length - 1) to the union offsets buffer
+      _NANOARROW_CHECK_RANGE(array->children[child_index]->length, 0, INT32_MAX);
+      NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(
+          ArrowArrayBuffer(array, 1), (int32_t)array->children[child_index]->length - 1));
+      break;
+    case NANOARROW_TYPE_SPARSE_UNION:
+      // Append one empty to any non-target column that isn't already the right length
+      // or abort if appending a null will result in a column with invalid length
+      for (int64_t i = 0; i < array->n_children; i++) {
+        if (i == child_index || array->children[i]->length == (array->length + 1)) {
+          continue;
+        }
+
+        if (array->children[i]->length != array->length) {
+          return EINVAL;
+        }
+
+        NANOARROW_RETURN_NOT_OK(ArrowArrayAppendEmpty(array->children[i], 1));
+      }
+
+      break;
+    default:
+      return EINVAL;
+  }
+
+  // Write to the type_ids buffer
+  NANOARROW_RETURN_NOT_OK(
+      ArrowBufferAppendInt8(ArrowArrayBuffer(array, 0), (int8_t)type_id));
+  array->length++;
+  return NANOARROW_OK;
+}
+
 static inline void ArrowArrayViewMove(struct ArrowArrayView* src,
                                       struct ArrowArrayView* dst) {
   memcpy(dst, src, sizeof(struct ArrowArrayView));