You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/28 09:56:47 UTC

[arrow-rs] branch master updated: Let the `StringBuilder` use `BinaryBuilder` (#2181)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9e47779e3 Let the `StringBuilder` use `BinaryBuilder` (#2181)
9e47779e3 is described below

commit 9e47779e3f4667e676d00db108e725d576930fcc
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Thu Jul 28 17:56:41 2022 +0800

    Let the `StringBuilder` use `BinaryBuilder` (#2181)
    
    * update string builder
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * update tests and fmt
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/src/array/array_string.rs                   |  31 +++--
 arrow/src/array/builder/generic_binary_builder.rs |  23 ++++
 arrow/src/array/builder/generic_string_builder.rs | 145 +++++++++-------------
 3 files changed, 108 insertions(+), 91 deletions(-)

diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index df858d858..12a6b2f98 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -20,8 +20,8 @@ use std::fmt;
 use std::{any::Any, iter::FromIterator};
 
 use super::{
-    array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray,
-    GenericStringIter, OffsetSizeTrait,
+    array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
+    GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait,
 };
 use crate::array::array::ArrayAccessor;
 use crate::buffer::Buffer;
@@ -313,6 +313,27 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor
     }
 }
 
+impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
+    for GenericStringArray<OffsetSize>
+{
+    fn from(v: GenericListArray<OffsetSize>) -> Self {
+        GenericStringArray::<OffsetSize>::from_list(v)
+    }
+}
+
+impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
+    for GenericStringArray<OffsetSize>
+{
+    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
+        let builder = v
+            .into_data()
+            .into_builder()
+            .data_type(Self::get_data_type());
+        let data = unsafe { builder.build_unchecked() };
+        Self::from(data)
+    }
+}
+
 impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetSize> {
     fn from(data: ArrayData) -> Self {
         assert_eq!(
@@ -385,12 +406,6 @@ pub type StringArray = GenericStringArray<i32>;
 /// ```
 pub type LargeStringArray = GenericStringArray<i64>;
 
-impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> {
-    fn from(v: GenericListArray<T>) -> Self {
-        GenericStringArray::<T>::from_list(v)
-    }
-}
-
 #[cfg(test)]
 mod tests {
 
diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs
index 1e29c470c..8f242243c 100644
--- a/arrow/src/array/builder/generic_binary_builder.rs
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -48,6 +48,19 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
         }
     }
 
+    /// Creates a new [`GenericBinaryBuilder`],
+    /// `item_capacity` is the number of items to pre-allocate space for in this builder
+    /// `data_capacity` is the number of bytes to pre-allocate space for in this builder
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
+        offsets_builder.append(OffsetSize::zero());
+        Self {
+            value_builder: UInt8BufferBuilder::new(data_capacity),
+            offsets_builder,
+            null_buffer_builder: NullBufferBuilder::new(item_capacity),
+        }
+    }
+
     /// Appends a byte slice into the builder.
     #[inline]
     pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
@@ -82,6 +95,16 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
         let array_data = unsafe { array_builder.build_unchecked() };
         GenericBinaryArray::<OffsetSize>::from(array_data)
     }
+
+    /// Returns the current values buffer as a slice
+    pub fn values_slice(&self) -> &[u8] {
+        self.value_builder.as_slice()
+    }
+
+    /// Returns the current offsets buffer as a slice
+    pub fn offsets_slice(&self) -> &[OffsetSize] {
+        self.offsets_builder.as_slice()
+    }
 }
 
 impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs
index d44aed44a..02c34bdd3 100644
--- a/arrow/src/array/builder/generic_string_builder.rs
+++ b/arrow/src/array/builder/generic_string_builder.rs
@@ -15,60 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::array::{
-    ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait,
-    UInt8Builder,
-};
+use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
 use std::any::Any;
 use std::sync::Arc;
 
+use super::GenericBinaryBuilder;
+
+///  Array builder for [`GenericStringArray`]
 #[derive(Debug)]
 pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
-    builder: GenericListBuilder<OffsetSize, UInt8Builder>,
+    builder: GenericBinaryBuilder<OffsetSize>,
 }
 
 impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
-    /// Creates a new `StringBuilder`,
+    /// Creates a new [`GenericStringBuilder`],
     /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
     pub fn new(capacity: usize) -> Self {
-        let values_builder = UInt8Builder::new(capacity);
         Self {
-            builder: GenericListBuilder::new(values_builder),
+            builder: GenericBinaryBuilder::new(capacity),
         }
     }
 
-    /// Creates a new `StringBuilder`,
+    /// Creates a new [`GenericStringBuilder`],
     /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
     /// `item_capacity` is the number of items to pre-allocate space for in this builder
     pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let values_builder = UInt8Builder::new(data_capacity);
         Self {
-            builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
+            builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
         }
     }
 
     /// Appends a string into the builder.
-    ///
-    /// Automatically calls the `append` method to delimit the string appended in as a
-    /// distinct array element.
     #[inline]
     pub fn append_value(&mut self, value: impl AsRef<str>) {
-        self.builder
-            .values()
-            .append_slice(value.as_ref().as_bytes());
-        self.builder.append(true);
-    }
-
-    /// Finish the current variable-length list array slot.
-    #[inline]
-    pub fn append(&mut self, is_valid: bool) {
-        self.builder.append(is_valid)
+        self.builder.append_value(value.as_ref().as_bytes());
     }
 
     /// Append a null value to the array.
     #[inline]
     pub fn append_null(&mut self) {
-        self.append(false)
+        self.builder.append_null()
     }
 
     /// Append an `Option` value to the array.
@@ -80,14 +66,14 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
         };
     }
 
-    /// Builds the `StringArray` and reset this builder.
+    /// Builds the [`GenericStringArray`] and reset this builder.
     pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
         GenericStringArray::<OffsetSize>::from(self.builder.finish())
     }
 
     /// Returns the current values buffer as a slice
     pub fn values_slice(&self) -> &[u8] {
-        self.builder.values_ref().values_slice()
+        self.builder.values_slice()
     }
 
     /// Returns the current offsets buffer as a slice
@@ -131,79 +117,72 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSi
 
 #[cfg(test)]
 mod tests {
-    use crate::array::builder::StringBuilder;
-    use crate::array::{Array, ArrayBuilder};
+    use super::*;
+    use crate::array::{Array, ArrayBuilder, OffsetSizeTrait};
 
-    #[test]
-    fn test_string_array_builder() {
-        let mut builder = StringBuilder::new(20);
+    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
+        let mut builder = GenericStringBuilder::<O>::new(20);
+        let owned = "arrow".to_owned();
 
         builder.append_value("hello");
-        builder.append(true);
-        builder.append_value("world");
-
-        let string_array = builder.finish();
+        builder.append_value("");
+        builder.append_value(&owned);
+        builder.append_null();
+        builder.append_option(Some("rust"));
+        builder.append_option(None::<&str>);
+        builder.append_option(None::<String>);
+        assert_eq!(7, builder.len());
+
+        assert_eq!(
+            GenericStringArray::<O>::from(vec![
+                Some("hello"),
+                Some(""),
+                Some("arrow"),
+                None,
+                Some("rust"),
+                None,
+                None
+            ]),
+            builder.finish()
+        );
+    }
 
-        assert_eq!(3, string_array.len());
-        assert_eq!(0, string_array.null_count());
-        assert_eq!("hello", string_array.value(0));
-        assert_eq!("", string_array.value(1));
-        assert_eq!("world", string_array.value(2));
-        assert_eq!(5, string_array.value_offsets()[2]);
-        assert_eq!(5, string_array.value_length(2));
+    #[test]
+    fn test_string_array_builder() {
+        _test_generic_string_array_builder::<i32>()
     }
 
     #[test]
-    fn test_string_array_builder_finish() {
-        let mut builder = StringBuilder::new(10);
+    fn test_large_string_array_builder() {
+        _test_generic_string_array_builder::<i64>()
+    }
+
+    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
+        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
 
         builder.append_value("hello");
-        builder.append_value("world");
+        builder.append_value("rust");
+        builder.append_null();
 
-        let mut arr = builder.finish();
-        assert_eq!(2, arr.len());
-        assert_eq!(0, builder.len());
+        builder.finish();
+        assert!(builder.is_empty());
+        assert_eq!(&[O::zero()], builder.offsets_slice());
 
         builder.append_value("arrow");
-        arr = builder.finish();
-        assert_eq!(1, arr.len());
-        assert_eq!(0, builder.len());
+        builder.append_value("parquet");
+        let arr = builder.finish();
+        // array should not have null buffer because there is not `null` value.
+        assert_eq!(None, arr.data().null_buffer());
+        assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
     }
 
     #[test]
-    fn test_string_array_builder_append_string() {
-        let mut builder = StringBuilder::new(20);
-
-        let var = "hello".to_owned();
-        builder.append_value(&var);
-        builder.append(true);
-        builder.append_value("world");
-
-        let string_array = builder.finish();
-
-        assert_eq!(3, string_array.len());
-        assert_eq!(0, string_array.null_count());
-        assert_eq!("hello", string_array.value(0));
-        assert_eq!("", string_array.value(1));
-        assert_eq!("world", string_array.value(2));
-        assert_eq!(5, string_array.value_offsets()[2]);
-        assert_eq!(5, string_array.value_length(2));
+    fn test_string_array_builder_finish() {
+        _test_generic_string_array_builder_finish::<i32>()
     }
 
     #[test]
-    fn test_string_array_builder_append_option() {
-        let mut builder = StringBuilder::new(20);
-        builder.append_option(Some("hello"));
-        builder.append_option(None::<&str>);
-        builder.append_option(None::<String>);
-        builder.append_option(Some("world"));
-
-        let string_array = builder.finish();
-
-        assert_eq!(4, string_array.len());
-        assert_eq!("hello", string_array.value(0));
-        assert!(string_array.is_null(1));
-        assert!(string_array.is_null(2));
-        assert_eq!("world", string_array.value(3));
+    fn test_large_string_array_builder_finish() {
+        _test_generic_string_array_builder_finish::<i64>()
     }
 }