You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/07/28 15:17:55 UTC

[GitHub] [arrow] bkietz commented on a change in pull request #10817: ARROW-9434: [C++] Store type code in UnionScalar

bkietz commented on a change in pull request #10817:
URL: https://github.com/apache/arrow/pull/10817#discussion_r678384557



##########
File path: cpp/src/arrow/array/array_base.cc
##########
@@ -103,26 +103,28 @@ struct ScalarFromArraySlotImpl {
   }
 
   Status Visit(const SparseUnionArray& a) {
+    const auto type_code = a.type_code(index_);
     // child array which stores the actual value
-    auto arr = a.field(a.child_id(index_));
+    const auto arr = a.field(a.child_id(index_));
     // no need to adjust the index
     ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(index_));
     if (value->is_valid) {
-      out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, a.type()));
+      out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, type_code, a.type()));
     } else {
       out_ = MakeNullScalar(a.type());

Review comment:
      It'd probably be better to preserve the type_code of the null slot for consistency.
   ```suggestion
         out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(type_code, a.type()));
   ```

##########
File path: cpp/src/arrow/scalar.cc
##########
@@ -97,8 +97,13 @@ struct ScalarHashImpl {
     return Status::OK();
   }
 
-  // TODO(bkietz) implement less wimpy hashing when these have ValueType

Review comment:
       Thanks for fixing this too

##########
File path: cpp/src/arrow/array/builder_base.cc
##########
@@ -206,6 +207,45 @@ struct AppendScalarImpl {
     return Status::OK();
   }
 
+  Status Visit(const SparseUnionType& type) { return MakeUnionArray(type); }
+
+  Status Visit(const DenseUnionType& type) { return MakeUnionArray(type); }
+
+  template <typename T>
+  Status MakeUnionArray(const T& type) {
+    using BuilderType = typename TypeTraits<T>::BuilderType;
+
+    auto* builder = internal::checked_cast<BuilderType*>(builder_);
+    const auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
+
+    RETURN_NOT_OK(builder->Reserve(count));
+
+    DCHECK_EQ(type.num_fields(), builder->num_children());
+    for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+      RETURN_NOT_OK(builder->child_builder(field_index)->Reserve(count));
+    }
+
+    for (int64_t i = 0; i < n_repeats_; i++) {
+      for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
+        // For each scalar, 1. append the type code, 2. append the value to
+        // the corresponding child (and append null to the other children)

Review comment:
       This seems inappropriate for the dense array; for that case shouldn't we *only* append to the child builder which corresponds to the appended scalar's type code? What you've written doesn't produce an invalid dense union array but it will be somewhat pessimized since there will be unnecessary null slots

##########
File path: cpp/src/arrow/scalar_test.cc
##########
@@ -930,117 +930,134 @@ TEST(TestDictionaryScalar, Cast) {
   }
 }
 
-TEST(TestSparseUnionScalar, Basics) {
-  auto ty = sparse_union({field("string", utf8()), field("number", uint64())});
+void CheckGetValidUnionScalar(const Array& arr, int64_t index, const Scalar& expected,
+                              const Scalar& expected_value) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, arr.GetScalar(index));
+  ASSERT_TRUE(scalar->Equals(expected));
+
+  const auto& as_union = checked_cast<const UnionScalar&>(*scalar);
+  ASSERT_TRUE(as_union.is_valid);
+  ASSERT_TRUE(as_union.value->Equals(expected_value));
+}
 
-  auto alpha = MakeScalar("alpha");
-  auto beta = MakeScalar("beta");
-  ASSERT_OK_AND_ASSIGN(auto two, MakeScalar(uint64(), 2));
+void CheckGetNullUnionScalar(const Array& arr, int64_t index) {
+  ASSERT_OK_AND_ASSIGN(auto scalar, arr.GetScalar(index));
+  ASSERT_TRUE(scalar->Equals(MakeNullScalar(arr.type())));
 
-  auto scalar_alpha = SparseUnionScalar(alpha, ty);
-  auto scalar_beta = SparseUnionScalar(beta, ty);
-  auto scalar_two = SparseUnionScalar(two, ty);
+  const auto& as_union = checked_cast<const UnionScalar&>(*scalar);
+  ASSERT_FALSE(as_union.is_valid);
+  // XXX in reality, the union array doesn't have a validity bitmap.
+  // Validity is inferred from the underlying child value, which should maybe
+  // be reflected here...
+  ASSERT_EQ(as_union.value, nullptr);

Review comment:
       In this case, I think allowing UnionScalar::value to be nullptr instead of MakeNullScalar(...) is tantamount to considering shared_ptr<Scalar>(nullptr) to be an acceptable way to represent a null scalar. I think we should instead maintain scalars which are "shaped" as identically to their corresponding arrays as possible, and in this case I think that requires a non-nullptr UnionScalar::value.
   ```suggestion
     ASSERT_TRUE(as_union.value->is_valid);
   ```

##########
File path: cpp/src/arrow/array/array_base.cc
##########
@@ -103,26 +103,28 @@ struct ScalarFromArraySlotImpl {
   }
 
   Status Visit(const SparseUnionArray& a) {
+    const auto type_code = a.type_code(index_);
     // child array which stores the actual value
-    auto arr = a.field(a.child_id(index_));
+    const auto arr = a.field(a.child_id(index_));
     // no need to adjust the index
     ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(index_));
     if (value->is_valid) {
-      out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, a.type()));
+      out_ = std::shared_ptr<Scalar>(new SparseUnionScalar(value, type_code, a.type()));
     } else {
       out_ = MakeNullScalar(a.type());
     }
     return Status::OK();
   }
 
   Status Visit(const DenseUnionArray& a) {
+    const auto type_code = a.type_code(index_);
     // child array which stores the actual value
     auto arr = a.field(a.child_id(index_));
     // need to look up the value based on offsets
     auto offset = a.value_offset(index_);
     ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(offset));
     if (value->is_valid) {
-      out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(value, a.type()));
+      out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(value, type_code, a.type()));
     } else {
       out_ = MakeNullScalar(a.type());

Review comment:
       Same:
   ```suggestion
         out_ = std::shared_ptr<Scalar>(new DenseUnionScalar(type_code, a.type()));
   ```

##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -613,18 +614,85 @@ class RepeatedArrayFactory {
     return Status::OK();
   }
 
-  Status Visit(const ExtensionType& type) {
-    return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+  Status Visit(const SparseUnionType& type) {
+    const auto& union_scalar = checked_cast<const UnionScalar&>(scalar_);
+    const auto& union_type = checked_cast<const UnionType&>(*scalar_.type);
+    const auto scalar_type_code = union_scalar.type_code;
+    const auto scalar_child_id = union_type.child_ids()[scalar_type_code];
+
+    // Create child arrays: most of them are all-null, except for the child array
+    // for the given type code (if the scalar is valid).
+    ArrayVector fields;
+    for (int i = 0; i < type.num_fields(); ++i) {
+      fields.emplace_back();
+      if (i == scalar_child_id && scalar_.is_valid) {
+        ARROW_ASSIGN_OR_RAISE(fields.back(),
+                              MakeArrayFromScalar(*union_scalar.value, length_, pool_));
+      } else {
+        ARROW_ASSIGN_OR_RAISE(
+            fields.back(), MakeArrayOfNull(union_type.field(i)->type(), length_, pool_));
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto type_codes_buffer, CreateUnionTypeCodes(scalar_type_code));
+
+    out_ = std::make_shared<SparseUnionArray>(scalar_.type, length_, std::move(fields),
+                                              std::move(type_codes_buffer));
+    return Status::OK();
   }
 
   Status Visit(const DenseUnionType& type) {
-    return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+    const auto& union_scalar = checked_cast<const UnionScalar&>(scalar_);
+    const auto& union_type = checked_cast<const UnionType&>(*scalar_.type);
+    const auto scalar_type_code = union_scalar.type_code;
+    const auto scalar_child_id = union_type.child_ids()[scalar_type_code];
+
+    // Create child arrays: all of them are empty, except for the child array
+    // for the given type code (if length > 0).
+    ArrayVector fields;
+    for (int i = 0; i < type.num_fields(); ++i) {
+      fields.emplace_back();
+      if (i == scalar_child_id && length_ > 0) {
+        if (scalar_.is_valid) {
+          // One valid element (will be referenced by multiple offsets)

Review comment:
      (we've had this discussion before, [see also](https://github.com/apache/arrow/pull/9794)) I'm not convinced this is legal — [Columnar.rst:561](https://github.com/bkietz/arrow/blob/fb117988139df607984594aac39c3ea59cecbe32/docs/source/format/Columnar.rst#L561) states that offsets must be "in order / increasing", not "in order / non decreasing" or "in order / monotonic".
   
    Clearly, since CI is passing, existing code accepts runs of equal offsets (and off the top of my head I can't think of a reason this would be problematic). In light of that: please open a follow-up JIRA to rewrite that line of Columnar.rst to explicitly allow runs of equal offsets.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org