You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "westonpace (via GitHub)" <gi...@apache.org> on 2023/04/14 22:40:32 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #35036: GH-34315: [C++] Correct is_null kernel for Union and RunEndEncoded logical nulls

westonpace commented on code in PR #35036:
URL: https://github.com/apache/arrow/pull/35036#discussion_r1167313989


##########
cpp/src/arrow/array/data.cc:
##########
@@ -304,9 +304,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
 
   Type::type type_id = value.type->id();
 
-  // Populate null count and validity bitmap (only for non-union/null types)
-  this->null_count = value.is_valid ? 0 : 1;
-  if (!is_union(type_id) && type_id != Type::NA) {
+  // Populate null count and validity bitmap
+  if (type_id == Type::NA) {
+    this->null_count = 1;
+  } else if (is_union(type_id) || type_id == Type::RUN_END_ENCODED) {
+    this->null_count = 0;
+  } else {
+    this->null_count = value.is_valid ? 0 : 1;

Review Comment:
   Do we have dictionary scalars?



##########
cpp/src/arrow/compute/kernels/scalar_validity.cc:
##########
@@ -82,6 +84,71 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;

Review Comment:
   Maybe a foolish question.  What happens if we have something like `REE<REE<int>>` or `REE<UNION<...>>`?



##########
cpp/src/arrow/compute/kernels/scalar_validity.cc:
##########
@@ -82,6 +84,71 @@ static void SetNanBits(const ArraySpan& arr, uint8_t* out_bitmap, int64_t out_of
   }
 }
 
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                          int64_t out_offset) {
+  const auto* sparse_union_type =
+      ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+    if (span.child_data[child_id].IsNull(i + span.offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                                         int64_t out_offset) {
+  const auto* dense_union_type =
+      ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+  DCHECK_LE(span.child_data.size(), 128);
+
+  const int8_t* types = span.GetValues<int8_t>(1);      // NOLINT
+  const int32_t* offsets = span.GetValues<int32_t>(2);  // NOLINT
+  for (int64_t i = 0; i < span.length; i++) {
+    const int8_t child_id = dense_union_type->child_ids()[types[i]];
+    const int32_t offset = offsets[i];
+    if (span.child_data[child_id].IsNull(offset)) {
+      bit_util::SetBit(out_bitmap, i + out_offset);
+    }
+  }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+                           int64_t out_offset) {
+  const auto& values = arrow::ree_util::ValuesArray(span);
+  const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data : NULLPTR;
+
+  if (!values_bitmap) {
+    return;
+  }
+
+  arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
+  auto end = ree_span.end();
+  for (auto it = ree_span.begin(); it != end; ++it) {
+    if (!bit_util::GetBit(values_bitmap, values.offset + it.index_into_array())) {
+      bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset, it.run_length(),
+                          true);
+    }
+  }

Review Comment:
   Just checking my understanding here. If the function `IsNull` is given an array of type `REE<X>`, then the return type is `bool` (and not `REE<bool>`), and so here we are exploding the (smaller) bitmap of the values child into a larger output bitmap?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org