You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/29 18:45:13 UTC

[GitHub] [arrow] bkietz commented on a change in pull request #10606: ARROW-13005: [C++] Add support for take implementation on dense union type

bkietz commented on a change in pull request #10606:
URL: https://github.com/apache/arrow/pull/10606#discussion_r660855712



##########
File path: cpp/src/arrow/type_traits.h
##########
@@ -435,6 +435,8 @@ struct TypeTraits<DenseUnionType> {
   using ArrayType = DenseUnionArray;
   using BuilderType = DenseUnionBuilder;
   using ScalarType = DenseUnionScalar;
+  using ValueOffsetBuilderType = Int32Builder;
+  using ChildIdBuilderType = Int8Builder;

Review comment:
       These traits are not generally useful, since they're not shared by any other type. Please revert them.

##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1279,6 +1279,58 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) {
       struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]");
 }
 
+class TestTakeKernelWithDenseUnion : public TestTakeKernelTyped<DenseUnionType> {
+ public:
+  void AssertTake(const std::shared_ptr<DataType>& type, const std::string& values,
+                  const std::string& indices, const std::string& type_codes,
+                  const std::string& value_offsets) {
+    auto union_array =
+        std::static_pointer_cast<DenseUnionArray>(ArrayFromJSON(type, values));
+    ArrayVector children;
+    children.reserve(type->num_fields());
+    std::vector<std::string> field_names;
+    field_names.reserve(type->num_fields());
+    for (int i = 0; i < type->num_fields(); i++) {
+      children.push_back(union_array->field(i));
+      field_names.push_back(type->field(i)->name());
+    }
+    ASSIGN_OR_ABORT(
+        std::shared_ptr<Array> expected,
+        DenseUnionArray::Make(*ArrayFromJSON(int8(), type_codes),
+                              *ArrayFromJSON(int32(), value_offsets), children,
+                              field_names, union_array->union_type()->type_codes()));
+

Review comment:
       ```suggestion
       ASSERT_OK(expected->ValidateFull());
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1279,6 +1279,58 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) {
       struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]");
 }
 
+class TestTakeKernelWithDenseUnion : public TestTakeKernelTyped<DenseUnionType> {
+ public:
+  void AssertTake(const std::shared_ptr<DataType>& type, const std::string& values,
+                  const std::string& indices, const std::string& type_codes,
+                  const std::string& value_offsets) {
+    auto union_array =
+        std::static_pointer_cast<DenseUnionArray>(ArrayFromJSON(type, values));
+    ArrayVector children;
+    children.reserve(type->num_fields());
+    std::vector<std::string> field_names;
+    field_names.reserve(type->num_fields());
+    for (int i = 0; i < type->num_fields(); i++) {
+      children.push_back(union_array->field(i));
+      field_names.push_back(type->field(i)->name());
+    }
+    ASSIGN_OR_ABORT(

Review comment:
       ```suggestion
       ASSERT_OK_AND_ASSIGN(
   ```

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -1668,6 +1668,66 @@ struct ListImpl : public Selection<ListImpl<Type>, Type> {
   }
 };
 
+struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
+  using Base = Selection<DenseUnionImpl, DenseUnionType>;
+  LIFT_BASE_MEMBERS();
+
+  typename TypeTraits<DenseUnionType>::ValueOffsetBuilderType value_offset_builder;
+  typename TypeTraits<DenseUnionType>::ChildIdBuilderType child_id_builder;
+  std::vector<int8_t> type_codes;
+
+  DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
+                 Datum* out)
+      : Base(ctx, batch, output_length, out),
+        value_offset_builder(ctx->memory_pool()),
+        child_id_builder(ctx->memory_pool()) {
+    DenseUnionArray typed_values(this->values);
+    type_codes = typed_values.union_type()->type_codes();
+  }
+
+  template <typename Adapter>
+  Status GenerateOutput() {
+    DenseUnionArray typed_values(this->values);
+    Adapter adapter(this);
+    RETURN_NOT_OK(adapter.Generate(
+        [&](int64_t index) {
+          auto child_id = typed_values.child_id(index);
+          child_id_builder.UnsafeAppend(type_codes[child_id]);
+          auto value_offset = typed_values.value_offset(index);
+          value_offset_builder.UnsafeAppend(value_offset);
+          return Status::OK();
+        },
+        // TODO: not able to handle null case
+        VisitNoop));
+    return Status::OK();
+  }
+
+  Status Init() override {
+    RETURN_NOT_OK(child_id_builder.Reserve(output_length));
+    RETURN_NOT_OK(value_offset_builder.Reserve(output_length));
+    return Status::OK();
+  }
+
+  Status Finish() override {
+    std::shared_ptr<Array> child_ids;
+    std::shared_ptr<Array> value_offsets;
+    RETURN_NOT_OK(child_id_builder.Finish(&child_ids));
+    RETURN_NOT_OK(value_offset_builder.Finish(&value_offsets));
+
+    DenseUnionArray typed_values(this->values);
+    auto num_fields = typed_values.num_fields();
+    ArrayVector child_arrays;
+    child_arrays.reserve(num_fields);
+    BufferVector buffers = {nullptr, checked_cast<const Int8Array&>(*child_ids).values(),
+                            checked_cast<const Int32Array&>(*value_offsets).values()};
+    *out = ArrayData(typed_values.type(), child_ids->length(), std::move(buffers), 0);
+    for (int i = 0; i < typed_values.num_fields(); i++) {
+      out->child_data.push_back(typed_values.field(i)->data());

Review comment:
       I think the larger problem here is that we may produce a dense union array whose offsets into each child are not increasing, which is a violation of [the protocol](https://github.com/bkietz/arrow/blob/3c994458183a4585063a4925bad7bf02f29ab93c/docs/source/format/Columnar.rst#L561).

##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1279,6 +1279,58 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) {
       struct_type, R"([{"a": 1}, {"a": 2, "b": "hello"}])", "[0, 1, 0]");
 }
 
+class TestTakeKernelWithDenseUnion : public TestTakeKernelTyped<DenseUnionType> {
+ public:
+  void AssertTake(const std::shared_ptr<DataType>& type, const std::string& values,
+                  const std::string& indices, const std::string& type_codes,
+                  const std::string& value_offsets) {
+    auto union_array =
+        std::static_pointer_cast<DenseUnionArray>(ArrayFromJSON(type, values));
+    ArrayVector children;
+    children.reserve(type->num_fields());
+    std::vector<std::string> field_names;
+    field_names.reserve(type->num_fields());
+    for (int i = 0; i < type->num_fields(); i++) {
+      children.push_back(union_array->field(i));
+      field_names.push_back(type->field(i)->name());
+    }
+    ASSIGN_OR_ABORT(
+        std::shared_ptr<Array> expected,
+        DenseUnionArray::Make(*ArrayFromJSON(int8(), type_codes),
+                              *ArrayFromJSON(int32(), value_offsets), children,
+                              field_names, union_array->union_type()->type_codes()));
+
+    std::shared_ptr<Array> actual;
+
+    for (auto index_type : {int8(), uint32()}) {
+      ASSERT_OK(TakeJSON(type, values, index_type, indices, &actual));
+      ASSERT_OK(actual->Validate());

Review comment:
       ```suggestion
         ASSERT_OK(actual->ValidateFull());
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org