You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/28 11:18:24 UTC

[GitHub] [arrow-rs] alamb commented on a diff in pull request #1589: Fix Null Mask Handling in ArrayData And UnionArray

alamb commented on code in PR #1589:
URL: https://github.com/apache/arrow-rs/pull/1589#discussion_r860755500


##########
arrow/src/array/builder.rs:
##########
@@ -1894,23 +1894,19 @@ struct FieldData {
     values_buffer: Option<MutableBuffer>,
     ///  The number of array slots represented by the buffer
     slots: usize,
-    /// A builder for the bitmap if required (for Sparse Unions)
-    bitmap_builder: Option<BooleanBufferBuilder>,
+    /// A builder for the null bitmap

Review Comment:
   👍 



##########
arrow/src/array/builder.rs:
##########
@@ -2070,39 +2061,13 @@ impl UnionBuilder {
             fields: HashMap::default(),
             type_id_builder: Int8BufferBuilder::new(capacity),
             value_offset_builder: None,
-            bitmap_builder: None,
         }
     }
 
     /// Appends a null to this builder.
     #[inline]
-    pub fn append_null(&mut self) -> Result<()> {
-        if self.bitmap_builder.is_none() {
-            let mut builder = BooleanBufferBuilder::new(self.len + 1);
-            for _ in 0..self.len {
-                builder.append(true);
-            }
-            self.bitmap_builder = Some(builder)
-        }
-        self.bitmap_builder
-            .as_mut()
-            .expect("Cannot be None")
-            .append(false);
-
-        self.type_id_builder.append(i8::default());
-
-        match &mut self.value_offset_builder {
-            // Handle dense union
-            Some(value_offset_builder) => value_offset_builder.append(i32::default()),
-            // Handle sparse union
-            None => {
-                for (_, fd) in self.fields.iter_mut() {
-                    fd.append_null_dynamic()?;
-                }
-            }
-        };
-        self.len += 1;
-        Ok(())
+    pub fn append_null<T: ArrowPrimitiveType>(&mut self, type_name: &str) -> Result<()> {

Review Comment:
   I realize from an implementation perspective why you chose to require appending a null by field name, but it seems kind of a messy API -- what do you think about potentially appending a null to the first child or something by default?
   
   Perhaps something to think about for a follow on PR



##########
arrow/src/array/array_union.rs:
##########
@@ -560,7 +554,7 @@ mod tests {
         builder.append::<Int32Type>("a", 1).unwrap();
         builder.append::<Int64Type>("c", 3).unwrap();
         builder.append::<Int32Type>("a", 10).unwrap();
-        builder.append_null().unwrap();
+        builder.append_null::<Int32Type>("a").unwrap();

Review Comment:
   it is strange to require a field name to append nulls -- I left a comment on how to improve things perhaps in the future



##########
arrow/src/array/equal/fixed_binary.rs:
##########
@@ -39,8 +36,8 @@ pub(super) fn fixed_binary_equal(
     let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..];
     let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..];
 
-    let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len);
-    let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len);
+    let lhs_null_count = count_nulls(lhs.null_buffer(), lhs_start + lhs.offset(), len);

Review Comment:
   it makes a lot more sense to look at the buffer's nulls directly



##########
arrow/src/compute/kernels/filter.rs:
##########
@@ -1707,17 +1738,17 @@ mod tests {
         let mut builder = UnionBuilder::new_sparse(4);
         builder.append::<Int32Type>("A", 1).unwrap();
         builder.append::<Float64Type>("B", 3.2).unwrap();
-        builder.append_null().unwrap();
+        builder.append_null::<Float64Type>("B").unwrap();
         builder.append::<Int32Type>("A", 34).unwrap();
         let array = builder.build().unwrap();
 
         let filter_array = BooleanArray::from(vec![true, false, true, false]);
         let c = filter(&array, &filter_array).unwrap();
         let filtered = c.as_any().downcast_ref::<UnionArray>().unwrap();
 
-        let mut builder = UnionBuilder::new_dense(1);
+        let mut builder = UnionBuilder::new_sparse(2);

Review Comment:
   I assume this was changed because the name of the test is `test_filter_union_array_sparse_with_nulls` but we were using a `dense` array previously



##########
arrow/src/array/data.rs:
##########
@@ -621,6 +621,13 @@ impl ArrayData {
         // Check that the data layout conforms to the spec
         let layout = layout(&self.data_type);
 
+        if !layout.can_contain_null_mask && self.null_bitmap.is_some() {

Review Comment:
   👍 



##########
arrow/src/array/array_union.rs:
##########
@@ -522,29 +516,29 @@ mod tests {
             match i {
                 0 => {
                     let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
-                    assert!(!union.is_null(i));
+                    assert!(!slot.is_null(0));

Review Comment:
   https://arrow.apache.org/docs/format/Columnar.html#union-layout
   
   Says
   
   > Unlike other data types, unions do not have their own validity bitmap. Instead, the nullness of each slot is determined exclusively by the child arrays which are composed to create the union.
   
   Which seems consistent 👍 
   
   However, it seems like `UnionArray` doesn't override `is_null` to take this into account.  Filed https://github.com/apache/arrow-rs/issues/1625 to track



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org