You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/03 21:00:24 UTC

[GitHub] [arrow] bkietz commented on a change in pull request #11843: ARROW-4975: [C++] Support concatenation of UnionArrays

bkietz commented on a change in pull request #11843:
URL: https://github.com/apache/arrow/pull/11843#discussion_r762249811



##########
File path: cpp/src/arrow/array/concatenate_test.cc
##########
@@ -364,27 +364,89 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) {
   AssertArraysEqual(*expected, *concat_actual);
 }
 
-TEST_F(ConcatenateTest, DISABLED_UnionType) {
+TEST_F(ConcatenateTest, UnionType) {
   // sparse mode
   Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
-    auto foo = this->GeneratePrimitive<Int8Type>(size, null_probability);
+    auto foo = this->GeneratePrimitive<Int8Type>(size, 0);
     auto bar = this->GeneratePrimitive<DoubleType>(size, null_probability);
     auto baz = this->GeneratePrimitive<BooleanType>(size, null_probability);
-    auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
+    auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, 0);
     ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz}));
   });
   // dense mode
-  Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
-    auto foo = this->GeneratePrimitive<Int8Type>(size, null_probability);
-    auto bar = this->GeneratePrimitive<DoubleType>(size, null_probability);
-    auto baz = this->GeneratePrimitive<BooleanType>(size, null_probability);
-    auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
-    auto value_offsets = rng_.Numeric<Int32Type>(size, 0, size, 0);
-    ASSERT_OK_AND_ASSIGN(
-        *out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz}));
+  Check([this](int32_t size, double null_probabilities, std::shared_ptr<Array>* out) {
+    *out = rng_.ArrayOf(dense_union({
+                            field("a", uint32()),
+                            field("b", boolean()),
+                            field("c", int8()),
+                        }),
+                        size, null_probabilities);
   });
 }
 
+TEST_F(ConcatenateTest, DenseUnionTypeOverflow) {
+  // Offset overflow
+  auto type_ids = ArrayFromJSON(int8(), "[0]");
+  auto offsets = ArrayFromJSON(int32(), "[2147483646]");
+  auto child_array = ArrayFromJSON(uint8(), "[0, 1]");
+  ASSERT_OK_AND_ASSIGN(auto array,
+                       DenseUnionArray::Make(*type_ids, *offsets, {child_array}));
+  ArrayVector arrays({array, array});
+  ASSERT_RAISES(Invalid, Concatenate(arrays, default_memory_pool()));
+
+  // Length overflow
+  auto type_ids_ok = ArrayFromJSON(int8(), "[0]");
+  auto offsets_ok = ArrayFromJSON(int32(), "[0]");
+  auto child_array_overflow =
+      this->rng_.ArrayOf(null(), std::numeric_limits<int32_t>::max() - 1, 0.0);
+  ASSERT_OK_AND_ASSIGN(
+      auto array_overflow,
+      DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow}));
+  ArrayVector arrays_overflow({array_overflow, array_overflow});
+  ASSERT_RAISES(Invalid, Concatenate(arrays_overflow, default_memory_pool()));
+}
+
+TEST_F(ConcatenateTest, DenseUnionType) {
+  auto type_ids = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1]");
+  auto offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2]");
+  auto child_one = ArrayFromJSON(boolean(), "[true, true, false]");
+  auto child_two = ArrayFromJSON(int8(), "[1, 2, 3]");
+  ASSERT_OK_AND_ASSIGN(
+      auto array, DenseUnionArray::Make(*type_ids, *offsets, {child_one, child_two}));
+  ASSERT_OK(array->ValidateFull());
+
+  ArrayVector arrays({array, array});
+  ASSERT_OK_AND_ASSIGN(auto concat_arrays, Concatenate(arrays, default_memory_pool()));
+  ASSERT_OK(concat_arrays->ValidateFull());
+
+  auto type_ids_expected = ArrayFromJSON(int8(), "[0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1]");
+  auto offsets_expected = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 3, 4, 3, 4, 5, 5]");
+  auto child_one_expected =
+      ArrayFromJSON(boolean(), "[true, true, false, true, true, false]");
+  auto child_two_expected = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]");
+  ASSERT_OK_AND_ASSIGN(auto expected,
+                       DenseUnionArray::Make(*type_ids_expected, *offsets_expected,
+                                             {child_one_expected, child_two_expected}));
+  ASSERT_OK(expected->ValidateFull());
+  AssertArraysEqual(*expected, *concat_arrays);
+
+  ArrayVector sliced_arrays({array->Slice(1, 4), array});
+  ASSERT_OK_AND_ASSIGN(auto concat_sliced_arrays,
+                       Concatenate(sliced_arrays, default_memory_pool()));
+  ASSERT_OK(concat_sliced_arrays->ValidateFull());
+
+  auto type_ids_sliced = ArrayFromJSON(int8(), "[0, 1, 1, 0, 0, 0, 1, 1, 0, 1]");
+  auto offsets_sliced = ArrayFromJSON(int32(), "[1, 0, 1, 2, 3, 4, 3, 4, 5, 5]");
+  auto child_one_sliced =
+      ArrayFromJSON(boolean(), "[true, true, false, true, true, false]");
+  auto child_two_sliced = ArrayFromJSON(int8(), "[1, 2, 3, 1, 2, 3]");
+  ASSERT_OK_AND_ASSIGN(auto expected_sliced,
+                       DenseUnionArray::Make(*type_ids_sliced, *offsets_sliced,
+                                             {child_one_sliced, child_two_sliced}));
+  ASSERT_OK(expected_sliced->ValidateFull());
+  AssertArraysEqual(*expected_sliced, *concat_sliced_arrays);

Review comment:
       I think for for most of these arrays it's acceptable and more readable to come straight from JSON
   ```suggestion
     AssertArraysEqual(*ArrayFromJSON(dense_union({field("", boolean()), field("", int8())}), R"([
       [0, true],
       [1, 1],
       [1, 2],
       [0, true],
       [0, false],
       [0, true],
       [1, 3],
       [1, 1],
       [0, true],
       [1, 3]
     ])"), *concat_sliced_arrays);
   ```
   
   The only case where you *need* to construct the dense union array with explicit children is when you're making an array with sliced children

##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -349,7 +350,90 @@ class ConcatenateImpl {
   }
 
   Status Visit(const UnionType& u) {
-    return Status::NotImplemented("concatenation of ", u);
+    // This implementation assumes that all input arrays are valid union arrays
+    // with same number of variants.
+
+    // Concatenate the type buffers.
+    ARROW_ASSIGN_OR_RAISE(auto type_buffers, Buffers(1, sizeof(int8_t)));
+    RETURN_NOT_OK(ConcatenateBuffers(type_buffers, pool_).Value(&out_->buffers[1]));
+
+    // Concatenate the child data. For sparse unions the child data is sliced
+    // based on the offset and length of the array data. For dense unions the
+    // child data is not sliced because this makes constructing the concatenated
+    // offsets buffer more simple. We could however choose to modify this and
+    // slice the child arrays and reflect this in the concatenated offsets
+    // buffer.
+    switch (u.mode()) {
+      case UnionMode::SPARSE: {
+        for (int i = 0; i < u.num_fields(); i++) {
+          ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
+          RETURN_NOT_OK(
+              ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
+        }
+        break;
+      }
+      case UnionMode::DENSE: {
+        for (int i = 0; i < u.num_fields(); i++) {
+          ArrayDataVector child_data(in_.size());
+          for (size_t j = 0; j < in_.size(); j++) {
+            child_data[j] = in_[j]->child_data[i];
+          }
+          RETURN_NOT_OK(
+              ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
+        }
+        break;
+      }
+    }
+
+    // Concatenate offsets buffers for dense union arrays.
+    if (u.mode() == UnionMode::DENSE) {
+      // The number of offset values is equal to the number of type_ids in the
+      // concatenated type buffers.
+      TypedBufferBuilder<int32_t> builder;
+      RETURN_NOT_OK(builder.Reserve(out_->length));
+
+      // Initialize a vector for child array lengths. These are updated during
+      // iteration over the input arrays to track the concatenated child array
+      // lengths. These lengths are used as offsets for the concatenated offsets
+      // buffer.
+      std::vector<int32_t> offset_map(u.num_fields());
+
+      // Iterate over all input arrays.
+      for (size_t i = 0; i < in_.size(); i++) {
+        // Get sliced type ids and offsets.
+        auto type_ids = in_[i]->GetValues<int8_t>(1);
+        auto offset_values = in_[i]->GetValues<int32_t>(2);
+
+        // Iterate over all elements in the type buffer and append the updated
+        // offset to the concatenated offsets buffer.
+        for (auto j = 0; j < in_[i]->length; j++) {
+          int32_t offset;
+          if (internal::AddWithOverflow(offset_map[type_ids[j]], offset_values[j],

Review comment:
       Please rewrite the unit tests to use type_codes other than 0..n

##########
File path: cpp/src/arrow/array/array_nested.cc
##########
@@ -708,10 +708,6 @@ DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length,
 Result<std::shared_ptr<Array>> DenseUnionArray::Make(
     const Array& type_ids, const Array& value_offsets, ArrayVector children,
     std::vector<std::string> field_names, std::vector<type_code_t> type_codes) {
-  if (value_offsets.length() == 0) {

Review comment:
       :+1: 

##########
File path: cpp/src/arrow/array/concatenate.cc
##########
@@ -349,7 +350,90 @@ class ConcatenateImpl {
   }
 
   Status Visit(const UnionType& u) {
-    return Status::NotImplemented("concatenation of ", u);
+    // This implementation assumes that all input arrays are valid union arrays
+    // with same number of variants.
+
+    // Concatenate the type buffers.
+    ARROW_ASSIGN_OR_RAISE(auto type_buffers, Buffers(1, sizeof(int8_t)));
+    RETURN_NOT_OK(ConcatenateBuffers(type_buffers, pool_).Value(&out_->buffers[1]));
+
+    // Concatenate the child data. For sparse unions the child data is sliced
+    // based on the offset and length of the array data. For dense unions the
+    // child data is not sliced because this makes constructing the concatenated
+    // offsets buffer more simple. We could however choose to modify this and
+    // slice the child arrays and reflect this in the concatenated offsets
+    // buffer.
+    switch (u.mode()) {
+      case UnionMode::SPARSE: {
+        for (int i = 0; i < u.num_fields(); i++) {
+          ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
+          RETURN_NOT_OK(
+              ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
+        }
+        break;
+      }
+      case UnionMode::DENSE: {
+        for (int i = 0; i < u.num_fields(); i++) {
+          ArrayDataVector child_data(in_.size());
+          for (size_t j = 0; j < in_.size(); j++) {
+            child_data[j] = in_[j]->child_data[i];
+          }
+          RETURN_NOT_OK(
+              ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i]));
+        }
+        break;
+      }
+    }
+
+    // Concatenate offsets buffers for dense union arrays.
+    if (u.mode() == UnionMode::DENSE) {
+      // The number of offset values is equal to the number of type_ids in the
+      // concatenated type buffers.
+      TypedBufferBuilder<int32_t> builder;
+      RETURN_NOT_OK(builder.Reserve(out_->length));
+
+      // Initialize a vector for child array lengths. These are updated during
+      // iteration over the input arrays to track the concatenated child array
+      // lengths. These lengths are used as offsets for the concatenated offsets
+      // buffer.
+      std::vector<int32_t> offset_map(u.num_fields());
+
+      // Iterate over all input arrays.
+      for (size_t i = 0; i < in_.size(); i++) {
+        // Get sliced type ids and offsets.
+        auto type_ids = in_[i]->GetValues<int8_t>(1);
+        auto offset_values = in_[i]->GetValues<int32_t>(2);
+
+        // Iterate over all elements in the type buffer and append the updated
+        // offset to the concatenated offsets buffer.
+        for (auto j = 0; j < in_[i]->length; j++) {
+          int32_t offset;
+          if (internal::AddWithOverflow(offset_map[type_ids[j]], offset_values[j],

Review comment:
       Unfortunately, union type ids are slightly subtler than this: the values in the type ids buffer are drawn from UnionType::type_codes and are not necessarily a valid index into child_data. For example, consider the an array of `dense_union<int32=2, string=5>`. This array's type_ids buffer would be `[2, 5, 2]` to indicate int, string, int
   
   UnionType::child_ids is a convenient mapping from type_id to child_data index




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org