You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/28 09:56:47 UTC
[arrow-rs] branch master updated: Let the `StringBuilder` use `BinaryBuilder` (#2181)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9e47779e3 Let the `StringBuilder` use `BinaryBuilder` (#2181)
9e47779e3 is described below
commit 9e47779e3f4667e676d00db108e725d576930fcc
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Thu Jul 28 17:56:41 2022 +0800
Let the `StringBuilder` use `BinaryBuilder` (#2181)
* update string builder
Signed-off-by: remzi <13...@gmail.com>
* update tests and fmt
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/array/array_string.rs | 31 +++--
arrow/src/array/builder/generic_binary_builder.rs | 23 ++++
arrow/src/array/builder/generic_string_builder.rs | 145 +++++++++-------------
3 files changed, 108 insertions(+), 91 deletions(-)
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index df858d858..12a6b2f98 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -20,8 +20,8 @@ use std::fmt;
use std::{any::Any, iter::FromIterator};
use super::{
- array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray,
- GenericStringIter, OffsetSizeTrait,
+ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
+ GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait,
};
use crate::array::array::ArrayAccessor;
use crate::buffer::Buffer;
@@ -313,6 +313,27 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor
}
}
+impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
+ for GenericStringArray<OffsetSize>
+{
+ fn from(v: GenericListArray<OffsetSize>) -> Self {
+ GenericStringArray::<OffsetSize>::from_list(v)
+ }
+}
+
+impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
+ for GenericStringArray<OffsetSize>
+{
+ fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
+ let builder = v
+ .into_data()
+ .into_builder()
+ .data_type(Self::get_data_type());
+ let data = unsafe { builder.build_unchecked() };
+ Self::from(data)
+ }
+}
+
impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetSize> {
fn from(data: ArrayData) -> Self {
assert_eq!(
@@ -385,12 +406,6 @@ pub type StringArray = GenericStringArray<i32>;
/// ```
pub type LargeStringArray = GenericStringArray<i64>;
-impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> {
- fn from(v: GenericListArray<T>) -> Self {
- GenericStringArray::<T>::from_list(v)
- }
-}
-
#[cfg(test)]
mod tests {
diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs
index 1e29c470c..8f242243c 100644
--- a/arrow/src/array/builder/generic_binary_builder.rs
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -48,6 +48,19 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
}
}
+ /// Creates a new [`GenericBinaryBuilder`],
+ /// `item_capacity` is the number of items to pre-allocate space for in this builder
+ /// `data_capacity` is the number of bytes to pre-allocate space for in this builder
+ pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
+ offsets_builder.append(OffsetSize::zero());
+ Self {
+ value_builder: UInt8BufferBuilder::new(data_capacity),
+ offsets_builder,
+ null_buffer_builder: NullBufferBuilder::new(item_capacity),
+ }
+ }
+
/// Appends a byte slice into the builder.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
@@ -82,6 +95,16 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
let array_data = unsafe { array_builder.build_unchecked() };
GenericBinaryArray::<OffsetSize>::from(array_data)
}
+
+ /// Returns the current values buffer as a slice
+ pub fn values_slice(&self) -> &[u8] {
+ self.value_builder.as_slice()
+ }
+
+ /// Returns the current offsets buffer as a slice
+ pub fn offsets_slice(&self) -> &[OffsetSize] {
+ self.offsets_builder.as_slice()
+ }
}
impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs
index d44aed44a..02c34bdd3 100644
--- a/arrow/src/array/builder/generic_string_builder.rs
+++ b/arrow/src/array/builder/generic_string_builder.rs
@@ -15,60 +15,46 @@
// specific language governing permissions and limitations
// under the License.
-use crate::array::{
- ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait,
- UInt8Builder,
-};
+use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
use std::any::Any;
use std::sync::Arc;
+use super::GenericBinaryBuilder;
+
+/// Array builder for [`GenericStringArray`]
#[derive(Debug)]
pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
- builder: GenericListBuilder<OffsetSize, UInt8Builder>,
+ builder: GenericBinaryBuilder<OffsetSize>,
}
impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
- /// Creates a new `StringBuilder`,
+ /// Creates a new [`GenericStringBuilder`],
/// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
pub fn new(capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(capacity);
Self {
- builder: GenericListBuilder::new(values_builder),
+ builder: GenericBinaryBuilder::new(capacity),
}
}
- /// Creates a new `StringBuilder`,
+ /// Creates a new [`GenericStringBuilder`],
/// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
/// `item_capacity` is the number of items to pre-allocate space for in this builder
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(data_capacity);
Self {
- builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
+ builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
}
}
/// Appends a string into the builder.
- ///
- /// Automatically calls the `append` method to delimit the string appended in as a
- /// distinct array element.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<str>) {
- self.builder
- .values()
- .append_slice(value.as_ref().as_bytes());
- self.builder.append(true);
- }
-
- /// Finish the current variable-length list array slot.
- #[inline]
- pub fn append(&mut self, is_valid: bool) {
- self.builder.append(is_valid)
+ self.builder.append_value(value.as_ref().as_bytes());
}
/// Append a null value to the array.
#[inline]
pub fn append_null(&mut self) {
- self.append(false)
+ self.builder.append_null()
}
/// Append an `Option` value to the array.
@@ -80,14 +66,14 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
};
}
- /// Builds the `StringArray` and reset this builder.
+ /// Builds the [`GenericStringArray`] and reset this builder.
pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
GenericStringArray::<OffsetSize>::from(self.builder.finish())
}
/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[u8] {
- self.builder.values_ref().values_slice()
+ self.builder.values_slice()
}
/// Returns the current offsets buffer as a slice
@@ -131,79 +117,72 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSi
#[cfg(test)]
mod tests {
- use crate::array::builder::StringBuilder;
- use crate::array::{Array, ArrayBuilder};
+ use super::*;
+ use crate::array::{Array, ArrayBuilder, OffsetSizeTrait};
- #[test]
- fn test_string_array_builder() {
- let mut builder = StringBuilder::new(20);
+ fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
+ let mut builder = GenericStringBuilder::<O>::new(20);
+ let owned = "arrow".to_owned();
builder.append_value("hello");
- builder.append(true);
- builder.append_value("world");
-
- let string_array = builder.finish();
+ builder.append_value("");
+ builder.append_value(&owned);
+ builder.append_null();
+ builder.append_option(Some("rust"));
+ builder.append_option(None::<&str>);
+ builder.append_option(None::<String>);
+ assert_eq!(7, builder.len());
+
+ assert_eq!(
+ GenericStringArray::<O>::from(vec![
+ Some("hello"),
+ Some(""),
+ Some("arrow"),
+ None,
+ Some("rust"),
+ None,
+ None
+ ]),
+ builder.finish()
+ );
+ }
- assert_eq!(3, string_array.len());
- assert_eq!(0, string_array.null_count());
- assert_eq!("hello", string_array.value(0));
- assert_eq!("", string_array.value(1));
- assert_eq!("world", string_array.value(2));
- assert_eq!(5, string_array.value_offsets()[2]);
- assert_eq!(5, string_array.value_length(2));
+ #[test]
+ fn test_string_array_builder() {
+ _test_generic_string_array_builder::<i32>()
}
#[test]
- fn test_string_array_builder_finish() {
- let mut builder = StringBuilder::new(10);
+ fn test_large_string_array_builder() {
+ _test_generic_string_array_builder::<i64>()
+ }
+
+ fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
+ let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
builder.append_value("hello");
- builder.append_value("world");
+ builder.append_value("rust");
+ builder.append_null();
- let mut arr = builder.finish();
- assert_eq!(2, arr.len());
- assert_eq!(0, builder.len());
+ builder.finish();
+ assert!(builder.is_empty());
+ assert_eq!(&[O::zero()], builder.offsets_slice());
builder.append_value("arrow");
- arr = builder.finish();
- assert_eq!(1, arr.len());
- assert_eq!(0, builder.len());
+ builder.append_value("parquet");
+ let arr = builder.finish();
+ // array should not have null buffer because there is not `null` value.
+ assert_eq!(None, arr.data().null_buffer());
+ assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
}
#[test]
- fn test_string_array_builder_append_string() {
- let mut builder = StringBuilder::new(20);
-
- let var = "hello".to_owned();
- builder.append_value(&var);
- builder.append(true);
- builder.append_value("world");
-
- let string_array = builder.finish();
-
- assert_eq!(3, string_array.len());
- assert_eq!(0, string_array.null_count());
- assert_eq!("hello", string_array.value(0));
- assert_eq!("", string_array.value(1));
- assert_eq!("world", string_array.value(2));
- assert_eq!(5, string_array.value_offsets()[2]);
- assert_eq!(5, string_array.value_length(2));
+ fn test_string_array_builder_finish() {
+ _test_generic_string_array_builder_finish::<i32>()
}
#[test]
- fn test_string_array_builder_append_option() {
- let mut builder = StringBuilder::new(20);
- builder.append_option(Some("hello"));
- builder.append_option(None::<&str>);
- builder.append_option(None::<String>);
- builder.append_option(Some("world"));
-
- let string_array = builder.finish();
-
- assert_eq!(4, string_array.len());
- assert_eq!("hello", string_array.value(0));
- assert!(string_array.is_null(1));
- assert!(string_array.is_null(2));
- assert_eq!("world", string_array.value(3));
+ fn test_large_string_array_builder_finish() {
+ _test_generic_string_array_builder_finish::<i64>()
}
}