You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "jorisvandenbossche (via GitHub)" <gi...@apache.org> on 2023/03/02 11:09:34 UTC

[GitHub] [arrow] jorisvandenbossche commented on a diff in pull request #34408: GH-34361: [C++] work-in-progress: Add skeleton of the new APIs for handling null checks correctly for all types

jorisvandenbossche commented on code in PR #34408:
URL: https://github.com/apache/arrow/pull/34408#discussion_r1122928924


##########
cpp/src/arrow/array/data.h:
##########
@@ -229,15 +256,88 @@ struct ARROW_EXPORT ArrayData {
 
   void SetNullCount(int64_t v) { null_count.store(v); }
 
-  /// \brief Return null count, or compute and set it if it's not known
+  /// \brief Return physical null count, or compute and set it if it's not known
   int64_t GetNullCount() const;
 
+  /// \brief Return true if the data has a validity bitmap and the physical null
+  /// count is known to be non-zero or not yet known.
+  ///
+  /// Note that this is not the same as MayHaveLogicalNulls, which also checks
+  /// for the presence of nulls in child data for types like unions and run-end
+  /// encoded types.
+  ///
+  /// \see HasValidityBitmap
+  /// \see MayHaveLogicalNulls
   bool MayHaveNulls() const {
     // If an ArrayData is slightly malformed it may have kUnknownNullCount set
     // but no buffer
     return null_count.load() != 0 && buffers[0] != NULLPTR;
   }
 
+  /// \brief Return true if the data has a validity bitmap
+  bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
+
+  /// \brief Return true if the validity bitmap may have 0's in it, or if the
+  /// child arrays (in the case of types without a validity bitmap) may have
+  /// nulls
+  ///
+  /// This is not a drop-in replacement for MayHaveNulls, as historically
+  /// MayHaveNulls() has been used to check for the presence of a validity
+  /// bitmap that needs to be checked.
+  ///
+  /// Code that previously used MayHaveNulls() and then dealt with the validity
+  /// bitmap directly can be fixed to handle all types correctly without
+  /// performance degradation when handling most types by adopting
+  /// HasValidityBitmap and MayHaveLogicalNulls.
+  ///
+  /// BEFORE:
+  ///
+  ///   uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+  ///   for (int64_t i = 0; i < array.length; ++i) {
+  ///     if (validity && !bit_util::GetBit(validity, i)) {
+  ///       continue;  // skip a NULL
+  ///     }
+  ///     ...
+  ///   }
+  ///
+  /// AFTER:
+  ///
+  ///   bool all_valid = !array.MayHaveLogicalNulls();
+  ///   uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
+  ///   for (int64_t i = 0; i < array.length; ++i) {
+  ///     bool is_valid = all_valid ||
+  ///                     (validity && bit_util::GetBit(validity, i)) ||
+  ///                     array.IsValid(i);
+  ///     if (!is_valid) {
+  ///       continue;  // skip a NULL
+  ///     }
+  ///     ...
+  ///   }
+  bool MayHaveLogicalNulls() const {
+    if (buffers[0] != NULLPTR) {
+      return null_count.load() != 0;
+    }
+    const auto t = type->id();
+    if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
+      return internal::UnionMayHaveLogicalNulls(*this);
+    }
+    if (t == Type::RUN_END_ENCODED) {
+      return internal::RunEndEncodedMayHaveLogicalNulls(*this);
+    }
+    return false;
+  }
+
+  /// \brief Computes the logical null count for arrays of all types including
+  /// those that do not have a validity bitmap like union and run-end encoded
+  /// arrays
+  ///
+  /// If the array has a validity bitmap, this function behaves the same as
+  /// GetNullCount. For types that have no validity bitmap, this function will
+  /// recompute the null count every time it is called.

Review Comment:
   It's not possible to cache this like it is done for `null_count_` ? (can child data be changed that would invalidate this?)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org