You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/21 20:13:31 UTC
[arrow-rs] branch master updated: Update `GenericBinaryBuilder` to use buffer builders directly. (#2117)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 1805b30e6 Update `GenericBinaryBuilder` to use buffer builders directly. (#2117)
1805b30e6 is described below
commit 1805b30e620b05bd050a9f2dfcf5ef41e05977ea
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Fri Jul 22 04:13:27 2022 +0800
Update `GenericBinaryBuilder` to use buffer builders directly. (#2117)
* update the struct of binary builder
Signed-off-by: remzi <13...@gmail.com>
* refine the test code
Signed-off-by: remzi <13...@gmail.com>
* more tests
Signed-off-by: remzi <13...@gmail.com>
* fix nits
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/array/builder/generic_binary_builder.rs | 225 +++++++++++++---------
1 file changed, 135 insertions(+), 90 deletions(-)
diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs
index 52d51fb18..54c1855a1 100644
--- a/arrow/src/array/builder/generic_binary_builder.rs
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -15,63 +15,72 @@
// specific language governing permissions and limitations
// under the License.
-use crate::array::{
- ArrayBuilder, ArrayRef, GenericBinaryArray, GenericListBuilder, OffsetSizeTrait,
- UInt8Builder,
+use crate::{
+ array::{
+ ArrayBuilder, ArrayDataBuilder, ArrayRef, GenericBinaryArray, OffsetSizeTrait,
+ UInt8BufferBuilder,
+ },
+ datatypes::DataType,
};
use std::any::Any;
use std::sync::Arc;
-/// Array builder for `BinaryArray`
+use super::{BooleanBufferBuilder, BufferBuilder};
+
+/// Array builder for [`GenericBinaryArray`]
#[derive(Debug)]
pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
- builder: GenericListBuilder<OffsetSize, UInt8Builder>,
+ value_builder: UInt8BufferBuilder,
+ offsets_builder: BufferBuilder<OffsetSize>,
+ null_buffer_builder: BooleanBufferBuilder,
}
impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
- /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values
- /// array
+ /// Creates a new [`GenericBinaryBuilder`].
+ /// `capacity` is the number of bytes in the values array.
pub fn new(capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(capacity);
+ let mut offsets_builder = BufferBuilder::<OffsetSize>::new(1024);
+ offsets_builder.append(OffsetSize::zero());
Self {
- builder: GenericListBuilder::new(values_builder),
+ value_builder: UInt8BufferBuilder::new(capacity),
+ offsets_builder,
+ null_buffer_builder: BooleanBufferBuilder::new(1024),
}
}
- /// Appends a single byte value into the builder's values array.
- ///
- /// Note, when appending individual byte values you must call `append` to delimit each
- /// distinct list value.
- #[inline]
- pub fn append_byte(&mut self, value: u8) {
- self.builder.values().append_value(value);
- }
-
/// Appends a byte slice into the builder.
- ///
- /// Automatically calls the `append` method to delimit the slice appended in as a
- /// distinct array element.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
- self.builder.values().append_slice(value.as_ref());
- self.builder.append(true);
- }
-
- /// Finish the current variable-length list array slot.
- #[inline]
- pub fn append(&mut self, is_valid: bool) {
- self.builder.append(is_valid)
+ self.value_builder.append_slice(value.as_ref());
+ self.null_buffer_builder.append(true);
+ self.offsets_builder
+ .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
}
/// Append a null value to the array.
#[inline]
pub fn append_null(&mut self) {
- self.append(false)
+ self.null_buffer_builder.append(false);
+ self.offsets_builder
+ .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
}
- /// Builds the `BinaryArray` and reset this builder.
+ /// Builds the [`GenericBinaryArray`] and reset this builder.
pub fn finish(&mut self) -> GenericBinaryArray<OffsetSize> {
- GenericBinaryArray::<OffsetSize>::from(self.builder.finish())
+ let array_type = if OffsetSize::IS_LARGE {
+ DataType::LargeBinary
+ } else {
+ DataType::Binary
+ };
+ let array_builder = ArrayDataBuilder::new(array_type)
+ .len(self.len())
+ .add_buffer(self.offsets_builder.finish())
+ .add_buffer(self.value_builder.finish())
+ .null_bit_buffer(Some(self.null_buffer_builder.finish()));
+
+ self.offsets_builder.append(OffsetSize::zero());
+ let array_data = unsafe { array_builder.build_unchecked() };
+ GenericBinaryArray::<OffsetSize>::from(array_data)
}
}
@@ -91,14 +100,14 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSi
self
}
- /// Returns the number of array slots in the builder
+ /// Returns the number of binary slots in the builder
fn len(&self) -> usize {
- self.builder.len()
+ self.null_buffer_builder.len()
}
- /// Returns whether the number of array slots is zero
+ /// Returns whether the number of binary slots is zero
fn is_empty(&self) -> bool {
- self.builder.is_empty()
+ self.null_buffer_builder.is_empty()
}
/// Builds the array and reset this builder.
@@ -109,64 +118,100 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSi
#[cfg(test)]
mod tests {
- use crate::array::builder::{BinaryBuilder, LargeBinaryBuilder};
- use crate::array::Array;
+ use super::*;
+ use crate::array::{Array, OffsetSizeTrait};
+
+ fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
+ let mut builder = GenericBinaryBuilder::<O>::new(20);
+
+ builder.append_value(b"hello");
+ builder.append_value(b"");
+ builder.append_null();
+ builder.append_value(b"rust");
+
+ let array = builder.finish();
+
+ assert_eq!(4, array.len());
+ assert_eq!(1, array.null_count());
+ assert_eq!(b"hello", array.value(0));
+ assert_eq!([] as [u8; 0], array.value(1));
+ assert!(array.is_null(2));
+ assert_eq!(b"rust", array.value(3));
+ assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
+ assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
+ }
+
+ #[test]
+ fn test_binary_builder() {
+ _test_generic_binary_builder::<i32>()
+ }
+
+ #[test]
+ fn test_large_binary_builder() {
+ _test_generic_binary_builder::<i64>()
+ }
+
+ fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
+ let mut builder = GenericBinaryBuilder::<O>::new(10);
+ builder.append_null();
+ builder.append_null();
+ builder.append_null();
+ assert_eq!(3, builder.len());
+ assert!(!builder.is_empty());
+
+ let array = builder.finish();
+ assert_eq!(3, array.null_count());
+ assert_eq!(3, array.len());
+ assert!(array.is_null(0));
+ assert!(array.is_null(1));
+ assert!(array.is_null(2));
+ }
+
+ #[test]
+ fn test_binary_builder_all_nulls() {
+ _test_generic_binary_builder_all_nulls::<i32>()
+ }
+
+ #[test]
+ fn test_large_binary_builder_all_nulls() {
+ _test_generic_binary_builder_all_nulls::<i64>()
+ }
+
+ fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
+ let mut builder = GenericBinaryBuilder::<O>::new(20);
+
+ builder.append_value(b"hello");
+ builder.append_value(b"");
+ builder.append_null();
+ builder.append_value(b"rust");
+ builder.finish();
+
+ assert!(builder.is_empty());
+
+ builder.append_value(b"parquet");
+ builder.append_null();
+ builder.append_value(b"arrow");
+ builder.append_value(b"");
+ let array = builder.finish();
+
+ assert_eq!(4, array.len());
+ assert_eq!(1, array.null_count());
+ assert_eq!(b"parquet", array.value(0));
+ assert!(array.is_null(1));
+ assert_eq!(b"arrow", array.value(2));
+ assert_eq!(b"", array.value(1));
+ assert_eq!(O::zero(), array.value_offsets()[0]);
+ assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
+ assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
+ }
#[test]
- fn test_binary_array_builder() {
- let mut builder = BinaryBuilder::new(20);
-
- builder.append_byte(b'h');
- builder.append_byte(b'e');
- builder.append_byte(b'l');
- builder.append_byte(b'l');
- builder.append_byte(b'o');
- builder.append(true);
- builder.append(true);
- builder.append_byte(b'w');
- builder.append_byte(b'o');
- builder.append_byte(b'r');
- builder.append_byte(b'l');
- builder.append_byte(b'd');
- builder.append(true);
-
- let binary_array = builder.finish();
-
- assert_eq!(3, binary_array.len());
- assert_eq!(0, binary_array.null_count());
- assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
- assert_eq!([] as [u8; 0], binary_array.value(1));
- assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2));
- assert_eq!(5, binary_array.value_offsets()[2]);
- assert_eq!(5, binary_array.value_length(2));
+ fn test_binary_builder_reset() {
+ _test_generic_binary_builder_reset::<i32>()
}
#[test]
- fn test_large_binary_array_builder() {
- let mut builder = LargeBinaryBuilder::new(20);
-
- builder.append_byte(b'h');
- builder.append_byte(b'e');
- builder.append_byte(b'l');
- builder.append_byte(b'l');
- builder.append_byte(b'o');
- builder.append(true);
- builder.append(true);
- builder.append_byte(b'w');
- builder.append_byte(b'o');
- builder.append_byte(b'r');
- builder.append_byte(b'l');
- builder.append_byte(b'd');
- builder.append(true);
-
- let binary_array = builder.finish();
-
- assert_eq!(3, binary_array.len());
- assert_eq!(0, binary_array.null_count());
- assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
- assert_eq!([] as [u8; 0], binary_array.value(1));
- assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2));
- assert_eq!(5, binary_array.value_offsets()[2]);
- assert_eq!(5, binary_array.value_length(2));
+ fn test_large_binary_builder_reset() {
+ _test_generic_binary_builder_reset::<i64>()
}
}