You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/05/31 10:15:42 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35036: GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls

pitrou commented on code in PR #35036:
URL: https://github.com/apache/arrow/pull/35036#discussion_r1211427115


##########
cpp/src/arrow/compute/kernels/scalar_validity.cc:
##########
@@ -82,6 +84,72 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  DCHECK(!is_nested(values.type->id()));

Review Comment:
   AFAIK, it is not forbidden to have nested REE values. So this should be turned into an error return (perhaps `Status::NotImplemented`) at a higher level.
   
   However, instead of refusing to implement this, you could use another strategy:
   * call `IsNull` on the values array
   * REE-decode the REE array comprised of (run ends, run values IsNull)
   



##########
cpp/src/arrow/compute/kernels/scalar_validity_test.cc:
##########
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
                    "[true, false, false, true]", &nan_is_null_options);
 }
 
+TEST_F(TestBooleanValidityKernels, IsNullUnion) {
+  auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
+  auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
+  auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
+  ASSERT_OK_AND_ASSIGN(auto arr1,
+                       SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
+  auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
+  CheckScalarUnary("is_null", arr1, expected);
+
+  auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
+  auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
+  auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
+  ASSERT_OK_AND_ASSIGN(auto arr2,
+                       DenseUnionArray::Make(*type_ids, *value_offsets,
+                                             {dense_field_i64, dense_field_str}));
+  CheckScalarUnary("is_null", arr2, expected);
+}

Review Comment:
   Can you add floating-point child union values and test with/without the `nan_is_null` option set?



##########
cpp/src/arrow/array/data.cc:
##########
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
   span->buffers[buffer_index].size = 2 * sizeof(offset_type);
 }
 
+template <typename RunEndType>
+void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {

Review Comment:
   If so, can you perhaps add a test for `FillFromScalar` on a run-end-encoded scalar? For example in `arrow/array/array_run_end_test.cc`.



##########
cpp/src/arrow/array/data.cc:
##########
@@ -304,9 +316,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
 
   Type::type type_id = value.type->id();
 
-  // Populate null count and validity bitmap (only for non-union/null types)
-  this->null_count = value.is_valid ? 0 : 1;
-  if (!is_union(type_id) && type_id != Type::NA) {
+  // Populate null count and validity bitmap
+  if (type_id == Type::NA) {
+    this->null_count = 1;
+  } else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {

Review Comment:
   ```suggestion
     } else if (!HasValidityBitmap(type_id)) {
   ```



##########
python/pyarrow/tests/test_compute.py:
##########
@@ -1637,6 +1637,48 @@ def test_is_null():
     assert result.equals(expected)
 
 
+def test_is_null_union():

Review Comment:
   Is it necessary to add these tests on the Python side? You have similar tests in C++ already.



##########
cpp/src/arrow/array/data.cc:
##########
@@ -236,6 +236,18 @@ void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_siz
   span->buffers[buffer_index].size = 2 * sizeof(offset_type);
 }
 
+template <typename RunEndType>
+void FillRunEndsArrayForScalar(ArraySpan* span, const DataType* run_end_type) {

Review Comment:
   Are these changes related for this PR?



##########
cpp/src/arrow/compute/kernels/scalar_validity_test.cc:
##########
@@ -92,6 +92,34 @@ TEST_F(TestBooleanValidityKernels, IsNull) {
                    "[true, false, false, true]", &nan_is_null_options);
 }
 
+TEST_F(TestBooleanValidityKernels, IsNullUnion) {
+  auto field_i64 = ArrayFromJSON(int64(), "[null, 127, null, null, null]");
+  auto field_str = ArrayFromJSON(utf8(), R"(["abcd", null, null, null, ""])");
+  auto type_ids = ArrayFromJSON(int8(), R"([1, 0, 0, 1, 1])");
+  ASSERT_OK_AND_ASSIGN(auto arr1,
+                       SparseUnionArray::Make(*type_ids, {field_i64, field_str}));
+  auto expected = ArrayFromJSON(boolean(), "[false, false, true, true, false]");
+  CheckScalarUnary("is_null", arr1, expected);
+
+  auto dense_field_i64 = ArrayFromJSON(int64(), "[127, null]");
+  auto dense_field_str = ArrayFromJSON(utf8(), R"(["abcd", null, ""])");
+  auto value_offsets = ArrayFromJSON(int32(), R"([0, 0, 1, 1, 2])");
+  ASSERT_OK_AND_ASSIGN(auto arr2,
+                       DenseUnionArray::Make(*type_ids, *value_offsets,
+                                             {dense_field_i64, dense_field_str}));
+  CheckScalarUnary("is_null", arr2, expected);
+}
+
+TEST_F(TestBooleanValidityKernels, IsNullRunEndEncoded) {
+  auto run_ends = ArrayFromJSON(int32(), R"([2, 3, 5, 7])");
+  auto values = ArrayFromJSON(int64(), R"([1, 2, null, 3])");
+  ASSERT_OK_AND_ASSIGN(auto ree_array, RunEndEncodedArray::Make(7, run_ends, values));
+  ASSERT_OK(ree_array->ValidateFull());
+  auto expected =
+      ArrayFromJSON(boolean(), "[false, false, false, true, true, false, false]");
+  CheckScalarUnary("is_null", ree_array, expected);
+}

Review Comment:
   Can you add a test with floating-point run end values and clearing/setting the `nan_is_null` option?



##########
cpp/src/arrow/array/data.cc:
##########
@@ -422,6 +438,22 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
         this->child_data[i].FillFromScalar(*scalar.value[i]);
       }
     }
+  } else if (type_id == Type::RUN_END_ENCODED) {
+    const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
+    this->child_data.resize(2);
+    auto& run_end_type = scalar.run_end_type();

Review Comment:
   Nit: style
   ```suggestion
       const auto& run_end_type = scalar.run_end_type();
   ```



##########
cpp/src/arrow/array/data.cc:
##########
@@ -304,9 +304,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
 
   Type::type type_id = value.type->id();
 
-  // Populate null count and validity bitmap (only for non-union/null types)
-  this->null_count = value.is_valid ? 0 : 1;
-  if (!is_union(type_id) && type_id != Type::NA) {
+  // Populate null count and validity bitmap
+  if (type_id == Type::NA) {
+    this->null_count = 1;
+  } else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
+    this->null_count = 0;
+  } else {
+    this->null_count = value.is_valid ? 0 : 1;

Review Comment:
   We do, but they have the same semantics as dictionary arrays: the top-level validity refers to the indices, regardless of the underlying dictionary values.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org