You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/21 19:03:41 UTC

[arrow-rs] branch master updated: Add GenericByteBuilder (#2969) (#3122)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b3dbe7011 Add GenericByteBuilder (#2969) (#3122)
b3dbe7011 is described below

commit b3dbe7011ace86c7ab46b1a15388f128bffb5f6d
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Mon Nov 21 19:03:35 2022 +0000

    Add GenericByteBuilder (#2969) (#3122)
    
    * Add GenericByteBuilder (#2969)
    
    * RAT
    
    * Apply suggestions from code review
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 ..._binary_builder.rs => generic_bytes_builder.rs} | 171 +++++++++++++----
 arrow-array/src/builder/generic_string_builder.rs  | 204 ---------------------
 arrow-array/src/builder/mod.rs                     |   6 +-
 3 files changed, 135 insertions(+), 246 deletions(-)

diff --git a/arrow-array/src/builder/generic_binary_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
similarity index 58%
rename from arrow-array/src/builder/generic_binary_builder.rs
rename to arrow-array/src/builder/generic_bytes_builder.rs
index c806bebf9..fa0a31ad7 100644
--- a/arrow-array/src/builder/generic_binary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -17,34 +17,35 @@
 
 use crate::builder::null_buffer_builder::NullBufferBuilder;
 use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
-use crate::{ArrayRef, GenericBinaryArray, OffsetSizeTrait};
+use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
+use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
+use arrow_buffer::ArrowNativeType;
 use arrow_data::ArrayDataBuilder;
 use std::any::Any;
 use std::sync::Arc;
 
-///  Array builder for [`GenericBinaryArray`]
-#[derive(Debug)]
-pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
+///  Array builder for [`GenericByteArray`]
+pub struct GenericByteBuilder<T: ByteArrayType> {
     value_builder: UInt8BufferBuilder,
-    offsets_builder: BufferBuilder<OffsetSize>,
+    offsets_builder: BufferBuilder<T::Offset>,
     null_buffer_builder: NullBufferBuilder,
 }
 
-impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
-    /// Creates a new [`GenericBinaryBuilder`].
+impl<T: ByteArrayType> GenericByteBuilder<T> {
+    /// Creates a new [`GenericByteBuilder`].
     pub fn new() -> Self {
         Self::with_capacity(1024, 1024)
     }
 
-    /// Creates a new [`GenericBinaryBuilder`].
+    /// Creates a new [`GenericByteBuilder`].
     ///
     /// - `item_capacity` is the number of items to pre-allocate.
     ///   The size of the preallocated buffer of offsets is the number of items plus one.
-    /// - `data_capacity` is the total number of bytes of string data to pre-allocate
+    /// - `data_capacity` is the total number of bytes of data to pre-allocate
     ///   (for all items, not per item).
     pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
-        offsets_builder.append(OffsetSize::zero());
+        let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
+        offsets_builder.append(T::Offset::from_usize(0).unwrap());
         Self {
             value_builder: UInt8BufferBuilder::new(data_capacity),
             offsets_builder,
@@ -52,13 +53,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
         }
     }
 
-    /// Appends a byte slice into the builder.
+    /// Appends a value into the builder.
     #[inline]
-    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
-        self.value_builder.append_slice(value.as_ref());
+    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
+        self.value_builder.append_slice(value.as_ref().as_ref());
         self.null_buffer_builder.append(true);
         self.offsets_builder
-            .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
+            .append(T::Offset::from_usize(self.value_builder.len()).unwrap());
+    }
+
+    /// Append an `Option` value into the builder.
+    #[inline]
+    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
+        match value {
+            None => self.append_null(),
+            Some(v) => self.append_value(v),
+        };
     }
 
     /// Append a null value into the builder.
@@ -66,21 +76,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
     pub fn append_null(&mut self) {
         self.null_buffer_builder.append(false);
         self.offsets_builder
-            .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
+            .append(T::Offset::from_usize(self.value_builder.len()).unwrap());
     }
 
-    /// Builds the [`GenericBinaryArray`] and reset this builder.
-    pub fn finish(&mut self) -> GenericBinaryArray<OffsetSize> {
-        let array_type = GenericBinaryArray::<OffsetSize>::DATA_TYPE;
+    /// Builds the [`GenericByteArray`] and reset this builder.
+    pub fn finish(&mut self) -> GenericByteArray<T> {
+        let array_type = T::DATA_TYPE;
         let array_builder = ArrayDataBuilder::new(array_type)
             .len(self.len())
             .add_buffer(self.offsets_builder.finish())
             .add_buffer(self.value_builder.finish())
             .null_bit_buffer(self.null_buffer_builder.finish());
 
-        self.offsets_builder.append(OffsetSize::zero());
+        self.offsets_builder
+            .append(T::Offset::from_usize(0).unwrap());
         let array_data = unsafe { array_builder.build_unchecked() };
-        GenericBinaryArray::<OffsetSize>::from(array_data)
+        GenericByteArray::from(array_data)
     }
 
     /// Returns the current values buffer as a slice
@@ -89,18 +100,44 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
     }
 
     /// Returns the current offsets buffer as a slice
-    pub fn offsets_slice(&self) -> &[OffsetSize] {
+    pub fn offsets_slice(&self) -> &[T::Offset] {
         self.offsets_builder.as_slice()
     }
 }
 
-impl<OffsetSize: OffsetSizeTrait> Default for GenericBinaryBuilder<OffsetSize> {
+impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
+        f.debug_struct("")
+            .field("value_builder", &self.value_builder)
+            .field("offsets_builder", &self.offsets_builder)
+            .field("null_buffer_builder", &self.null_buffer_builder)
+            .finish()
+    }
+}
+
+impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
     fn default() -> Self {
         Self::new()
     }
 }
 
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
+impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
+    /// Returns the number of binary slots in the builder
+    fn len(&self) -> usize {
+        self.null_buffer_builder.len()
+    }
+
+    /// Returns whether the number of binary slots is zero
+    fn is_empty(&self) -> bool {
+        self.null_buffer_builder.is_empty()
+    }
+
+    /// Builds the array and reset this builder.
+    fn finish(&mut self) -> ArrayRef {
+        Arc::new(self.finish())
+    }
+
     /// Returns the builder as a non-mutable `Any` reference.
     fn as_any(&self) -> &dyn Any {
         self
@@ -115,27 +152,19 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSi
     fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
         self
     }
+}
 
-    /// Returns the number of binary slots in the builder
-    fn len(&self) -> usize {
-        self.null_buffer_builder.len()
-    }
-
-    /// Returns whether the number of binary slots is zero
-    fn is_empty(&self) -> bool {
-        self.null_buffer_builder.is_empty()
-    }
+///  Array builder for [`GenericStringArray`][crate::GenericStringArray]
+pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
 
-    /// Builds the array and reset this builder.
-    fn finish(&mut self) -> ArrayRef {
-        Arc::new(self.finish())
-    }
-}
+///  Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
+pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
 
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::array::{Array, OffsetSizeTrait};
+    use crate::GenericStringArray;
 
     fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
         let mut builder = GenericBinaryBuilder::<O>::new();
@@ -230,4 +259,70 @@ mod tests {
     fn test_large_binary_builder_reset() {
         _test_generic_binary_builder_reset::<i64>()
     }
+
+    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
+        let mut builder = GenericStringBuilder::<O>::new();
+        let owned = "arrow".to_owned();
+
+        builder.append_value("hello");
+        builder.append_value("");
+        builder.append_value(&owned);
+        builder.append_null();
+        builder.append_option(Some("rust"));
+        builder.append_option(None::<&str>);
+        builder.append_option(None::<String>);
+        assert_eq!(7, builder.len());
+
+        assert_eq!(
+            GenericStringArray::<O>::from(vec![
+                Some("hello"),
+                Some(""),
+                Some("arrow"),
+                None,
+                Some("rust"),
+                None,
+                None
+            ]),
+            builder.finish()
+        );
+    }
+
+    #[test]
+    fn test_string_array_builder() {
+        _test_generic_string_array_builder::<i32>()
+    }
+
+    #[test]
+    fn test_large_string_array_builder() {
+        _test_generic_string_array_builder::<i64>()
+    }
+
+    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
+        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
+
+        builder.append_value("hello");
+        builder.append_value("rust");
+        builder.append_null();
+
+        builder.finish();
+        assert!(builder.is_empty());
+        assert_eq!(&[O::zero()], builder.offsets_slice());
+
+        builder.append_value("arrow");
+        builder.append_value("parquet");
+        let arr = builder.finish();
+        // array should not have null buffer because there is not `null` value.
+        assert_eq!(None, arr.data().null_buffer());
+        assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
+    }
+
+    #[test]
+    fn test_string_array_builder_finish() {
+        _test_generic_string_array_builder_finish::<i32>()
+    }
+
+    #[test]
+    fn test_large_string_array_builder_finish() {
+        _test_generic_string_array_builder_finish::<i64>()
+    }
 }
diff --git a/arrow-array/src/builder/generic_string_builder.rs b/arrow-array/src/builder/generic_string_builder.rs
deleted file mode 100644
index f766b6f55..000000000
--- a/arrow-array/src/builder/generic_string_builder.rs
+++ /dev/null
@@ -1,204 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use crate::builder::{ArrayBuilder, GenericBinaryBuilder};
-use crate::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
-use std::any::Any;
-use std::sync::Arc;
-
-///  Array builder for [`GenericStringArray`]
-#[derive(Debug)]
-pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
-    builder: GenericBinaryBuilder<OffsetSize>,
-}
-
-impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
-    /// Creates a new [`GenericStringBuilder`].
-    pub fn new() -> Self {
-        Self {
-            builder: GenericBinaryBuilder::new(),
-        }
-    }
-
-    /// Creates a new [`GenericStringBuilder`].
-    ///
-    /// - `item_capacity` is the number of items to pre-allocate.
-    ///   The size of the preallocated buffer of offsets is the number of items plus one.
-    /// - `data_capacity` is the total number of bytes of string data to pre-allocate
-    ///   (for all items, not per item).
-    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        Self {
-            builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
-        }
-    }
-
-    /// Appends a string into the builder.
-    #[inline]
-    pub fn append_value(&mut self, value: impl AsRef<str>) {
-        self.builder.append_value(value.as_ref().as_bytes());
-    }
-
-    /// Append a null value into the builder.
-    #[inline]
-    pub fn append_null(&mut self) {
-        self.builder.append_null()
-    }
-
-    /// Append an `Option` value into the builder.
-    #[inline]
-    pub fn append_option(&mut self, value: Option<impl AsRef<str>>) {
-        match value {
-            None => self.append_null(),
-            Some(v) => self.append_value(v),
-        };
-    }
-
-    /// Builds the [`GenericStringArray`] and reset this builder.
-    pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
-        let t = GenericStringArray::<OffsetSize>::DATA_TYPE;
-        let v = self.builder.finish();
-        let builder = v.into_data().into_builder().data_type(t);
-
-        // SAFETY:
-        // Data must be UTF-8 as only support writing `str`
-        // Offsets must be valid as guaranteed by `GenericBinaryBuilder`
-        let data = unsafe { builder.build_unchecked() };
-        data.into()
-    }
-
-    /// Returns the current values buffer as a slice.
-    pub fn values_slice(&self) -> &[u8] {
-        self.builder.values_slice()
-    }
-
-    /// Returns the current offsets buffer as a slice.
-    pub fn offsets_slice(&self) -> &[OffsetSize] {
-        self.builder.offsets_slice()
-    }
-}
-
-impl<OffsetSize: OffsetSizeTrait> Default for GenericStringBuilder<OffsetSize> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSize> {
-    /// Returns the builder as a non-mutable `Any` reference.
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    /// Returns the builder as a mutable `Any` reference.
-    fn as_any_mut(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    /// Returns the boxed builder as a box of `Any`.
-    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
-        self
-    }
-
-    /// Returns the number of array slots in the builder
-    fn len(&self) -> usize {
-        self.builder.len()
-    }
-
-    /// Returns whether the number of array slots is zero
-    fn is_empty(&self) -> bool {
-        self.builder.is_empty()
-    }
-
-    /// Builds the array and reset this builder.
-    fn finish(&mut self) -> ArrayRef {
-        let a = GenericStringBuilder::<OffsetSize>::finish(self);
-        Arc::new(a)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::array::{Array, OffsetSizeTrait};
-    use crate::builder::ArrayBuilder;
-
-    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
-        let mut builder = GenericStringBuilder::<O>::new();
-        let owned = "arrow".to_owned();
-
-        builder.append_value("hello");
-        builder.append_value("");
-        builder.append_value(&owned);
-        builder.append_null();
-        builder.append_option(Some("rust"));
-        builder.append_option(None::<&str>);
-        builder.append_option(None::<String>);
-        assert_eq!(7, builder.len());
-
-        assert_eq!(
-            GenericStringArray::<O>::from(vec![
-                Some("hello"),
-                Some(""),
-                Some("arrow"),
-                None,
-                Some("rust"),
-                None,
-                None
-            ]),
-            builder.finish()
-        );
-    }
-
-    #[test]
-    fn test_string_array_builder() {
-        _test_generic_string_array_builder::<i32>()
-    }
-
-    #[test]
-    fn test_large_string_array_builder() {
-        _test_generic_string_array_builder::<i64>()
-    }
-
-    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
-        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
-
-        builder.append_value("hello");
-        builder.append_value("rust");
-        builder.append_null();
-
-        builder.finish();
-        assert!(builder.is_empty());
-        assert_eq!(&[O::zero()], builder.offsets_slice());
-
-        builder.append_value("arrow");
-        builder.append_value("parquet");
-        let arr = builder.finish();
-        // array should not have null buffer because there is not `null` value.
-        assert_eq!(None, arr.data().null_buffer());
-        assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
-    }
-
-    #[test]
-    fn test_string_array_builder_finish() {
-        _test_generic_string_array_builder_finish::<i32>()
-    }
-
-    #[test]
-    fn test_large_string_array_builder_finish() {
-        _test_generic_string_array_builder_finish::<i64>()
-    }
-}
diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs
index 5edf011d7..a5c1e3d4b 100644
--- a/arrow-array/src/builder/mod.rs
+++ b/arrow-array/src/builder/mod.rs
@@ -28,12 +28,10 @@ mod fixed_size_binary_builder;
 pub use fixed_size_binary_builder::*;
 mod fixed_size_list_builder;
 pub use fixed_size_list_builder::*;
-mod generic_binary_builder;
-pub use generic_binary_builder::*;
+mod generic_bytes_builder;
+pub use generic_bytes_builder::*;
 mod generic_list_builder;
 pub use generic_list_builder::*;
-mod generic_string_builder;
-pub use generic_string_builder::*;
 mod map_builder;
 pub use map_builder::*;
 mod null_buffer_builder;