You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/11/01 18:31:29 UTC

[arrow] branch master updated: ARROW-3347: [Rust] Implement PrimitiveArrayBuilder

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 919119f  ARROW-3347: [Rust] Implement PrimitiveArrayBuilder
919119f is described below

commit 919119fd62eb27195eca5a1e5d131423b55eef31
Author: Paddy Horan <pa...@hotmail.com>
AuthorDate: Thu Nov 1 19:31:05 2018 +0100

    ARROW-3347: [Rust] Implement PrimitiveArrayBuilder
    
    Adds builder for `PrimitiveArray`.
    
    @sunchao has mentioned that it's unfortunate that we have to rely on macros to define the `impl` block for types implementing `ArrowPrimitiveType`.  When specialization lands in stable we can remove much/all of this but for now we have to rely on macros.
    
    This implementation mostly focuses on being correct.  However, maybe we should add `push_value_raw` and `push_null_raw` and allow the caller to handle updating the bitmap (i.e. avoid checking if the bitmap `is_some` on every `push`)?  If so, I can add this as a separate PR (along with other optimizations).
    
    Author: Paddy Horan <pa...@hotmail.com>
    
    Closes #2858 from paddyhoran/ARROW-3347 and squashes the following commits:
    
    13b8b7d4 <Paddy Horan> Made `advance` private
    c4d4e377 <Paddy Horan> Addressed review comments.
    82e7f4e9 <Paddy Horan> Fixed lint issues
    494397ef <Paddy Horan> Added push_slice
    1dae2a44 <Paddy Horan> Updated docs
    aa51bf9b <Paddy Horan> Initial implementation
---
 rust/src/array.rs   |  11 +++
 rust/src/builder.rs | 226 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 235 insertions(+), 2 deletions(-)

diff --git a/rust/src/array.rs b/rust/src/array.rs
index 0ee792c..0144c64 100644
--- a/rust/src/array.rs
+++ b/rust/src/array.rs
@@ -24,6 +24,7 @@ use std::sync::Arc;
 
 use array_data::*;
 use buffer::*;
+use builder::PrimitiveArrayBuilder;
 use datatypes::*;
 use memory;
 use util::bit_util;
@@ -194,6 +195,11 @@ macro_rules! def_primitive_array {
                 }
                 n
             }
+
+            /// Returns a new `PrimitiveArrayBuilder` for this native type, created via `new(capacity)`
+            pub fn builder(capacity: i64) -> PrimitiveArrayBuilder<$native_ty> {
+                PrimitiveArrayBuilder::<$native_ty>::new(capacity)
+            }
         }
 
         /// Constructs a primitive array from a vector. Should only be used for testing.
@@ -311,6 +317,11 @@ impl PrimitiveArray<bool> {
         assert!(offset < self.data.len());
         unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset as usize) }
     }
+
+    /// Returns a new `PrimitiveArrayBuilder<bool>`, created via `new(capacity)`
+    pub fn builder(capacity: i64) -> PrimitiveArrayBuilder<bool> {
+        PrimitiveArrayBuilder::<bool>::new(capacity)
+    }
 }
 
 /// Constructs a boolean array from a vector. Should only be used for testing.
diff --git a/rust/src/builder.rs b/rust/src/builder.rs
index 90ab606..187e153 100644
--- a/rust/src/builder.rs
+++ b/rust/src/builder.rs
@@ -22,8 +22,10 @@ use std::io::Write;
 use std::marker::PhantomData;
 use std::mem;
 
+use array::PrimitiveArray;
+use array_data::ArrayData;
 use buffer::{Buffer, MutableBuffer};
-use datatypes::{ArrowPrimitiveType, ToByteSlice};
+use datatypes::{ArrowPrimitiveType, DataType, ToByteSlice};
 use error::{ArrowError, Result};
 use util::bit_util;
 
@@ -55,6 +57,14 @@ macro_rules! impl_buffer_builder {
                 self.len
             }
 
+            /// Advances `len` by `i` slots, resizing the underlying buffer to fit; this function itself writes no values
+            fn advance(&mut self, i: i64) -> Result<()> {
+                let new_buffer_len = (self.len + i) as usize * mem::size_of::<$native_ty>(); // target length in bytes
+                self.buffer.resize(new_buffer_len)?; // `len` is only updated if the resize succeeded
+                self.len += i;
+                Ok(())
+            }
+
             /// Returns the current capacity of the builder (number of elements)
             pub fn capacity(&self) -> i64 {
                 let byte_capacity = self.buffer.capacity();
@@ -136,7 +146,15 @@ impl BufferBuilder<bool> {
         self.len
     }
 
-    /// Returns the current capacity of the builder (number of elements).
+    /// Advances `len` by `i` bits (one bit per boolean slot), resizing the underlying buffer to fit
+    pub fn advance(&mut self, i: i64) -> Result<()> { // NOTE(review): `pub` here, while the macro-generated `advance` is private — confirm intended
+        let new_buffer_len = bit_util::ceil(self.len + i, 8); // presumably bytes needed for `len + i` bits — confirm `bit_util::ceil`
+        self.buffer.resize(new_buffer_len as usize)?; // `len` is only updated if the resize succeeded
+        self.len += i;
+        Ok(())
+    }
+
+    /// Returns the current capacity of the builder (number of elements)
     pub fn capacity(&self) -> i64 {
         let byte_capacity = self.buffer.capacity() as i64;
         byte_capacity * 8
@@ -188,8 +206,99 @@ impl BufferBuilder<bool> {
     }
 }
 
+/// Array builder for fixed-width primitive types, pairing a values buffer with a validity bitmap
+pub struct PrimitiveArrayBuilder<T>
+where
+    T: ArrowPrimitiveType,
+{
+    values_builder: BufferBuilder<T>, // value slots; advanced by one slot without a write for each null
+    bitmap_builder: BufferBuilder<bool>, // one bit per slot: `true` is pushed for values, `false` for nulls
+}
+
+macro_rules! impl_primitive_array_builder {
+    ($data_ty:path, $native_ty:ident) => { // $data_ty: value handed to `ArrayData::builder`; $native_ty: Rust element type
+        impl PrimitiveArrayBuilder<$native_ty> {
+            /// Creates a new primitive array builder; the values and bitmap builders are both created with `capacity`
+            pub fn new(capacity: i64) -> Self {
+                Self {
+                    values_builder: BufferBuilder::<$native_ty>::new(capacity),
+                    bitmap_builder: BufferBuilder::<bool>::new(capacity),
+                }
+            }
+
+            /// Returns the capacity of this builder in slots, as reported by the values builder
+            pub fn capacity(&self) -> i64 {
+                self.values_builder.capacity()
+            }
+
+            /// Returns the length of this builder in slots, as reported by the values builder
+            pub fn len(&self) -> i64 {
+                self.values_builder.len()
+            }
+
+            /// Pushes a value into the builder: a `true` bit goes to the bitmap, then `v` to the values
+            pub fn push(&mut self, v: $native_ty) -> Result<()> {
+                self.bitmap_builder.push(true)?;
+                self.values_builder.push(v)?;
+                Ok(())
+            }
+
+            /// Pushes a null slot into the builder: a `false` bit goes to the bitmap and the values advance by one slot
+            pub fn push_null(&mut self) -> Result<()> {
+                self.bitmap_builder.push(false)?;
+                self.values_builder.advance(1)?; // keeps values and bitmap the same length; no value is written here
+                Ok(())
+            }
+
+            /// Pushes an `Option` into the builder: `None` becomes a null slot, `Some(v)` a value
+            pub fn push_option(&mut self, v: Option<$native_ty>) -> Result<()> {
+                match v {
+                    None => self.push_null()?,
+                    Some(v) => self.push(v)?,
+                };
+                Ok(())
+            }
+
+            /// Pushes a slice of values into the builder; every pushed slot gets a `true` bitmap bit
+            pub fn push_slice(&mut self, v: &[$native_ty]) -> Result<()> {
+                self.bitmap_builder.push_slice(&vec![true; v.len()][..])?; // temporary Vec of `v.len()` `true` flags
+                self.values_builder.push_slice(v)?;
+                Ok(())
+            }
+
+            /// Builds the PrimitiveArray, consuming the builder
+            pub fn finish(self) -> PrimitiveArray<$native_ty> {
+                let len = self.len();
+                let null_bit_buffer = self.bitmap_builder.finish();
+                let data = ArrayData::builder($data_ty)
+                    .len(len)
+                    .null_count(len - bit_util::count_set_bits(null_bit_buffer.data())) // nulls = slots minus set bitmap bits
+                    .add_buffer(self.values_builder.finish())
+                    .null_bit_buffer(null_bit_buffer)
+                    .build();
+                PrimitiveArray::<$native_ty>::from(data)
+            }
+        }
+    };
+}
+
+impl_primitive_array_builder!(DataType::Boolean, bool); // one inherent impl per (DataType variant, native type) pair
+impl_primitive_array_builder!(DataType::UInt8, u8);
+impl_primitive_array_builder!(DataType::UInt16, u16);
+impl_primitive_array_builder!(DataType::UInt32, u32);
+impl_primitive_array_builder!(DataType::UInt64, u64);
+impl_primitive_array_builder!(DataType::Int8, i8);
+impl_primitive_array_builder!(DataType::Int16, i16);
+impl_primitive_array_builder!(DataType::Int32, i32);
+impl_primitive_array_builder!(DataType::Int64, i64);
+impl_primitive_array_builder!(DataType::Float32, f32);
+impl_primitive_array_builder!(DataType::Float64, f64);
+
 #[cfg(test)]
 mod tests {
+
+    use array::Array;
+
     use super::*;
 
     #[test]
@@ -320,4 +429,117 @@ mod tests {
         assert_eq!(buf.len(), buf2.len());
         assert_eq!(buf.data(), buf2.data());
     }
+
+    #[test]
+    fn test_primitive_array_builder_i32() {
+        let mut builder = PrimitiveArray::<i32>::builder(5);
+        for i in 0..5 {
+            builder.push(i).unwrap();
+        }
+        let arr = builder.finish();
+        assert_eq!(5, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count()); // only `push` was used, so no slot is null
+        for i in 0..5 {
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i as i32, arr.value(i)); // values read back in push order
+        }
+    }
+
+    #[test]
+    fn test_primitive_array_builder_bool() {
+        // Expected value bytes [72, 2] = [0b01001000, 0b00000010]: bits 3, 6 and 9 set, counting from the LSB of byte 0
+        let buf = Buffer::from([72_u8, 2_u8]);
+        let mut builder = PrimitiveArray::<bool>::builder(10);
+        for i in 0..10 {
+            if i == 3 || i == 6 || i == 9 {
+                builder.push(true).unwrap();
+            } else {
+                builder.push(false).unwrap();
+            }
+        }
+
+        let arr = builder.finish();
+        assert_eq!(buf, arr.values());
+        assert_eq!(10, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count()); // `false` values are still valid slots, not nulls
+        for i in 0..10 {
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i)
+        }
+    }
+
+    #[test]
+    fn test_primitive_array_builder_push_option() {
+        let arr1 = PrimitiveArray::<i32>::from(vec![Some(0), None, Some(2), None, Some(4)]); // reference array
+
+        let mut builder = PrimitiveArray::<i32>::builder(5);
+        builder.push_option(Some(0)).unwrap();
+        builder.push_option(None).unwrap();
+        builder.push_option(Some(2)).unwrap();
+        builder.push_option(None).unwrap();
+        builder.push_option(Some(4)).unwrap();
+        let arr2 = builder.finish();
+
+        assert_eq!(arr1.len(), arr2.len());
+        assert_eq!(arr1.offset(), arr2.offset());
+        assert_eq!(arr1.null_count(), arr2.null_count());
+        for i in 0..5 {
+            assert_eq!(arr1.is_null(i), arr2.is_null(i));
+            assert_eq!(arr1.is_valid(i), arr2.is_valid(i));
+            if arr1.is_valid(i) { // values are compared only for valid slots
+                assert_eq!(arr1.value(i), arr2.value(i));
+            }
+        }
+    }
+
+    #[test]
+    fn test_primitive_array_builder_push_null() {
+        let arr1 = PrimitiveArray::<i32>::from(vec![Some(0), Some(2), None, None, Some(4)]); // reference array
+
+        let mut builder = PrimitiveArray::<i32>::builder(5);
+        builder.push(0).unwrap();
+        builder.push(2).unwrap();
+        builder.push_null().unwrap();
+        builder.push_null().unwrap();
+        builder.push(4).unwrap(); // a value pushed after nulls must land in slot 4
+        let arr2 = builder.finish();
+
+        assert_eq!(arr1.len(), arr2.len());
+        assert_eq!(arr1.offset(), arr2.offset());
+        assert_eq!(arr1.null_count(), arr2.null_count());
+        for i in 0..5 {
+            assert_eq!(arr1.is_null(i), arr2.is_null(i));
+            assert_eq!(arr1.is_valid(i), arr2.is_valid(i));
+            if arr1.is_valid(i) { // values are compared only for valid slots
+                assert_eq!(arr1.value(i), arr2.value(i));
+            }
+        }
+    }
+
+    #[test]
+    fn test_primitive_array_builder_push_slice() {
+        let arr1 = PrimitiveArray::<i32>::from(vec![Some(0), Some(2), None, None, Some(4)]); // reference array
+
+        let mut builder = PrimitiveArray::<i32>::builder(5);
+        builder.push_slice(&[0, 2]).unwrap(); // fills slots 0 and 1 in one call
+        builder.push_null().unwrap();
+        builder.push_null().unwrap();
+        builder.push(4).unwrap();
+        let arr2 = builder.finish();
+
+        assert_eq!(arr1.len(), arr2.len());
+        assert_eq!(arr1.offset(), arr2.offset());
+        assert_eq!(arr1.null_count(), arr2.null_count());
+        for i in 0..5 {
+            assert_eq!(arr1.is_null(i), arr2.is_null(i));
+            assert_eq!(arr1.is_valid(i), arr2.is_valid(i));
+            if arr1.is_valid(i) { // values are compared only for valid slots
+                assert_eq!(arr1.value(i), arr2.value(i));
+            }
+        }
+    }
 }