You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/15 15:15:45 UTC

[arrow-rs] branch master updated: Lazily materialize the null buffer builder of boolean builder (#2073)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7fff23ff2 Lazily materialize the null buffer builder of boolean builder (#2073)
7fff23ff2 is described below

commit 7fff23ff214e7237032c1815832ee959cb3a7cc2
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri Jul 15 23:15:38 2022 +0800

    Lazily materialize the null buffer builder of boolean builder (#2073)
    
    * lazily materialize the null buffer builder
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add docs
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/src/array/builder/boolean_builder.rs   | 89 ++++++++++++++++++++++++----
 arrow/src/array/builder/primitive_builder.rs | 27 ---------
 2 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/arrow/src/array/builder/boolean_builder.rs b/arrow/src/array/builder/boolean_builder.rs
index 98acb641b..d0063e566 100644
--- a/arrow/src/array/builder/boolean_builder.rs
+++ b/arrow/src/array/builder/boolean_builder.rs
@@ -60,7 +60,9 @@ use super::BooleanBufferBuilder;
 #[derive(Debug)]
 pub struct BooleanBuilder {
     values_builder: BooleanBufferBuilder,
-    bitmap_builder: BooleanBufferBuilder,
+    /// We only materialize the builder when we add `false`.
+    /// This optimization is **very** important for the performance.
+    bitmap_builder: Option<BooleanBufferBuilder>,
 }
 
 impl BooleanBuilder {
@@ -68,7 +70,7 @@ impl BooleanBuilder {
     pub fn new(capacity: usize) -> Self {
         Self {
             values_builder: BooleanBufferBuilder::new(capacity),
-            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            bitmap_builder: None,
         }
     }
 
@@ -80,15 +82,18 @@ impl BooleanBuilder {
     /// Appends a value of type `T` into the builder
     #[inline]
     pub fn append_value(&mut self, v: bool) -> Result<()> {
-        self.bitmap_builder.append(true);
         self.values_builder.append(v);
+        if let Some(b) = self.bitmap_builder.as_mut() {
+            b.append(true)
+        }
         Ok(())
     }
 
     /// Appends a null slot into the builder
     #[inline]
     pub fn append_null(&mut self) -> Result<()> {
-        self.bitmap_builder.append(false);
+        self.materialize_bitmap_builder();
+        self.bitmap_builder.as_mut().unwrap().append(false);
         self.values_builder.advance(1);
         Ok(())
     }
@@ -106,7 +111,9 @@ impl BooleanBuilder {
     /// Appends a slice of type `T` into the builder
     #[inline]
     pub fn append_slice(&mut self, v: &[bool]) -> Result<()> {
-        self.bitmap_builder.append_n(v.len(), true);
+        if let Some(b) = self.bitmap_builder.as_mut() {
+            b.append_n(v.len(), true)
+        }
         self.values_builder.append_slice(v);
         Ok(())
     }
@@ -115,28 +122,43 @@ impl BooleanBuilder {
     #[inline]
     pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<()> {
         if values.len() != is_valid.len() {
-            return Err(ArrowError::InvalidArgumentError(
+            Err(ArrowError::InvalidArgumentError(
                 "Value and validity lengths must be equal".to_string(),
-            ));
+            ))
+        } else {
+            is_valid
+                .iter()
+                .any(|v| !*v)
+                .then(|| self.materialize_bitmap_builder());
+            if let Some(b) = self.bitmap_builder.as_mut() {
+                b.append_slice(is_valid)
+            }
+            self.values_builder.append_slice(values);
+            Ok(())
         }
-        self.bitmap_builder.append_slice(is_valid);
-        self.values_builder.append_slice(values);
-        Ok(())
     }
 
     /// Builds the [BooleanArray] and reset this builder.
     pub fn finish(&mut self) -> BooleanArray {
         let len = self.len();
-        let null_bit_buffer = self.bitmap_builder.finish();
-        let null_count = len - null_bit_buffer.count_set_bits();
+        let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish());
         let builder = ArrayData::builder(DataType::Boolean)
             .len(len)
             .add_buffer(self.values_builder.finish())
-            .null_bit_buffer((null_count > 0).then(|| null_bit_buffer));
+            .null_bit_buffer(null_bit_buffer);
 
         let array_data = unsafe { builder.build_unchecked() };
         BooleanArray::from(array_data)
     }
+
+    fn materialize_bitmap_builder(&mut self) {
+        if self.bitmap_builder.is_none() {
+            let mut b = BooleanBufferBuilder::new(0);
+            b.reserve(self.values_builder.capacity());
+            b.append_n(self.values_builder.len(), true);
+            self.bitmap_builder = Some(b);
+        }
+    }
 }
 
 impl ArrayBuilder for BooleanBuilder {
@@ -174,6 +196,32 @@ impl ArrayBuilder for BooleanBuilder {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::{array::Array, buffer::Buffer};
+
+    #[test]
+    fn test_boolean_array_builder() {
+        // 00000010 01001000
+        let buf = Buffer::from([72_u8, 2_u8]);
+        let mut builder = BooleanArray::builder(10);
+        for i in 0..10 {
+            if i == 3 || i == 6 || i == 9 {
+                builder.append_value(true).unwrap();
+            } else {
+                builder.append_value(false).unwrap();
+            }
+        }
+
+        let arr = builder.finish();
+        assert_eq!(&buf, arr.values());
+        assert_eq!(10, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count());
+        for i in 0..10 {
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i)
+        }
+    }
 
     #[test]
     fn test_boolean_array_builder_append_slice() {
@@ -200,4 +248,19 @@ mod tests {
 
         assert_eq!(arr1, arr2);
     }
+
+    #[test]
+    fn test_boolean_array_builder_no_null() {
+        let mut builder = BooleanArray::builder(0);
+        builder.append_option(Some(true)).unwrap();
+        builder.append_value(false).unwrap();
+        builder.append_slice(&[true, false, true]).unwrap();
+        builder
+            .append_values(&[false, false, true], &[true, true, true])
+            .unwrap();
+
+        let array = builder.finish();
+        assert_eq!(0, array.null_count());
+        assert!(array.data().null_buffer().is_none());
+    }
 }
diff --git a/arrow/src/array/builder/primitive_builder.rs b/arrow/src/array/builder/primitive_builder.rs
index 2f5eeac73..ec1b408ed 100644
--- a/arrow/src/array/builder/primitive_builder.rs
+++ b/arrow/src/array/builder/primitive_builder.rs
@@ -216,12 +216,10 @@ mod tests {
     use super::*;
 
     use crate::array::Array;
-    use crate::array::BooleanArray;
     use crate::array::Date32Array;
     use crate::array::Int32Array;
     use crate::array::Int32Builder;
     use crate::array::TimestampSecondArray;
-    use crate::buffer::Buffer;
 
     #[test]
     fn test_primitive_array_builder_i32() {
@@ -303,31 +301,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_primitive_array_builder_bool() {
-        // 00000010 01001000
-        let buf = Buffer::from([72_u8, 2_u8]);
-        let mut builder = BooleanArray::builder(10);
-        for i in 0..10 {
-            if i == 3 || i == 6 || i == 9 {
-                builder.append_value(true).unwrap();
-            } else {
-                builder.append_value(false).unwrap();
-            }
-        }
-
-        let arr = builder.finish();
-        assert_eq!(&buf, arr.values());
-        assert_eq!(10, arr.len());
-        assert_eq!(0, arr.offset());
-        assert_eq!(0, arr.null_count());
-        for i in 0..10 {
-            assert!(!arr.is_null(i));
-            assert!(arr.is_valid(i));
-            assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i)
-        }
-    }
-
     #[test]
     fn test_primitive_array_builder_append_option() {
         let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]);