You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/21 19:03:41 UTC
[arrow-rs] branch master updated: Add GenericByteBuilder (#2969) (#3122)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new b3dbe7011 Add GenericByteBuilder (#2969) (#3122)
b3dbe7011 is described below
commit b3dbe7011ace86c7ab46b1a15388f128bffb5f6d
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Mon Nov 21 19:03:35 2022 +0000
Add GenericByteBuilder (#2969) (#3122)
* Add GenericByteBuilder (#2969)
* RAT
* Apply suggestions from code review
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
..._binary_builder.rs => generic_bytes_builder.rs} | 171 +++++++++++++----
arrow-array/src/builder/generic_string_builder.rs | 204 ---------------------
arrow-array/src/builder/mod.rs | 6 +-
3 files changed, 135 insertions(+), 246 deletions(-)
diff --git a/arrow-array/src/builder/generic_binary_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
similarity index 58%
rename from arrow-array/src/builder/generic_binary_builder.rs
rename to arrow-array/src/builder/generic_bytes_builder.rs
index c806bebf9..fa0a31ad7 100644
--- a/arrow-array/src/builder/generic_binary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -17,34 +17,35 @@
use crate::builder::null_buffer_builder::NullBufferBuilder;
use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
-use crate::{ArrayRef, GenericBinaryArray, OffsetSizeTrait};
+use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
+use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
+use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayDataBuilder;
use std::any::Any;
use std::sync::Arc;
-/// Array builder for [`GenericBinaryArray`]
-#[derive(Debug)]
-pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
+/// Array builder for [`GenericByteArray`]
+pub struct GenericByteBuilder<T: ByteArrayType> {
value_builder: UInt8BufferBuilder,
- offsets_builder: BufferBuilder<OffsetSize>,
+ offsets_builder: BufferBuilder<T::Offset>,
null_buffer_builder: NullBufferBuilder,
}
-impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
- /// Creates a new [`GenericBinaryBuilder`].
+impl<T: ByteArrayType> GenericByteBuilder<T> {
+ /// Creates a new [`GenericByteBuilder`].
pub fn new() -> Self {
Self::with_capacity(1024, 1024)
}
- /// Creates a new [`GenericBinaryBuilder`].
+ /// Creates a new [`GenericByteBuilder`].
///
/// - `item_capacity` is the number of items to pre-allocate.
/// The size of the preallocated buffer of offsets is the number of items plus one.
- /// - `data_capacity` is the total number of bytes of string data to pre-allocate
+ /// - `data_capacity` is the total number of bytes of data to pre-allocate
/// (for all items, not per item).
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
- offsets_builder.append(OffsetSize::zero());
+ let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
+ offsets_builder.append(T::Offset::from_usize(0).unwrap());
Self {
value_builder: UInt8BufferBuilder::new(data_capacity),
offsets_builder,
@@ -52,13 +53,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
}
}
- /// Appends a byte slice into the builder.
+ /// Appends a value into the builder.
#[inline]
- pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
- self.value_builder.append_slice(value.as_ref());
+ pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
+ self.value_builder.append_slice(value.as_ref().as_ref());
self.null_buffer_builder.append(true);
self.offsets_builder
- .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
+ .append(T::Offset::from_usize(self.value_builder.len()).unwrap());
+ }
+
+ /// Append an `Option` value into the builder.
+ #[inline]
+ pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
+ match value {
+ None => self.append_null(),
+ Some(v) => self.append_value(v),
+ };
}
/// Append a null value into the builder.
@@ -66,21 +76,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
pub fn append_null(&mut self) {
self.null_buffer_builder.append(false);
self.offsets_builder
- .append(OffsetSize::from_usize(self.value_builder.len()).unwrap());
+ .append(T::Offset::from_usize(self.value_builder.len()).unwrap());
}
- /// Builds the [`GenericBinaryArray`] and reset this builder.
- pub fn finish(&mut self) -> GenericBinaryArray<OffsetSize> {
- let array_type = GenericBinaryArray::<OffsetSize>::DATA_TYPE;
+ /// Builds the [`GenericByteArray`] and reset this builder.
+ pub fn finish(&mut self) -> GenericByteArray<T> {
+ let array_type = T::DATA_TYPE;
let array_builder = ArrayDataBuilder::new(array_type)
.len(self.len())
.add_buffer(self.offsets_builder.finish())
.add_buffer(self.value_builder.finish())
.null_bit_buffer(self.null_buffer_builder.finish());
- self.offsets_builder.append(OffsetSize::zero());
+ self.offsets_builder
+ .append(T::Offset::from_usize(0).unwrap());
let array_data = unsafe { array_builder.build_unchecked() };
- GenericBinaryArray::<OffsetSize>::from(array_data)
+ GenericByteArray::from(array_data)
}
/// Returns the current values buffer as a slice
@@ -89,18 +100,44 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
}
/// Returns the current offsets buffer as a slice
- pub fn offsets_slice(&self) -> &[OffsetSize] {
+ pub fn offsets_slice(&self) -> &[T::Offset] {
self.offsets_builder.as_slice()
}
}
-impl<OffsetSize: OffsetSizeTrait> Default for GenericBinaryBuilder<OffsetSize> {
+impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
+ f.debug_struct("")
+ .field("value_builder", &self.value_builder)
+ .field("offsets_builder", &self.offsets_builder)
+ .field("null_buffer_builder", &self.null_buffer_builder)
+ .finish()
+ }
+}
+
+impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
fn default() -> Self {
Self::new()
}
}
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
+impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
+ /// Returns the number of binary slots in the builder
+ fn len(&self) -> usize {
+ self.null_buffer_builder.len()
+ }
+
+ /// Returns whether the number of binary slots is zero
+ fn is_empty(&self) -> bool {
+ self.null_buffer_builder.is_empty()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+
/// Returns the builder as a non-mutable `Any` reference.
fn as_any(&self) -> &dyn Any {
self
@@ -115,27 +152,19 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSi
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
+}
- /// Returns the number of binary slots in the builder
- fn len(&self) -> usize {
- self.null_buffer_builder.len()
- }
-
- /// Returns whether the number of binary slots is zero
- fn is_empty(&self) -> bool {
- self.null_buffer_builder.is_empty()
- }
+/// Array builder for [`GenericStringArray`][crate::GenericStringArray]
+pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- Arc::new(self.finish())
- }
-}
+/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
+pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
#[cfg(test)]
mod tests {
use super::*;
use crate::array::{Array, OffsetSizeTrait};
+ use crate::GenericStringArray;
fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
let mut builder = GenericBinaryBuilder::<O>::new();
@@ -230,4 +259,70 @@ mod tests {
fn test_large_binary_builder_reset() {
_test_generic_binary_builder_reset::<i64>()
}
+
+ fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
+ let mut builder = GenericStringBuilder::<O>::new();
+ let owned = "arrow".to_owned();
+
+ builder.append_value("hello");
+ builder.append_value("");
+ builder.append_value(&owned);
+ builder.append_null();
+ builder.append_option(Some("rust"));
+ builder.append_option(None::<&str>);
+ builder.append_option(None::<String>);
+ assert_eq!(7, builder.len());
+
+ assert_eq!(
+ GenericStringArray::<O>::from(vec![
+ Some("hello"),
+ Some(""),
+ Some("arrow"),
+ None,
+ Some("rust"),
+ None,
+ None
+ ]),
+ builder.finish()
+ );
+ }
+
+ #[test]
+ fn test_string_array_builder() {
+ _test_generic_string_array_builder::<i32>()
+ }
+
+ #[test]
+ fn test_large_string_array_builder() {
+ _test_generic_string_array_builder::<i64>()
+ }
+
+ fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
+ let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
+
+ builder.append_value("hello");
+ builder.append_value("rust");
+ builder.append_null();
+
+ builder.finish();
+ assert!(builder.is_empty());
+ assert_eq!(&[O::zero()], builder.offsets_slice());
+
+ builder.append_value("arrow");
+ builder.append_value("parquet");
+ let arr = builder.finish();
+ // array should not have null buffer because there is not `null` value.
+ assert_eq!(None, arr.data().null_buffer());
+ assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
+ }
+
+ #[test]
+ fn test_string_array_builder_finish() {
+ _test_generic_string_array_builder_finish::<i32>()
+ }
+
+ #[test]
+ fn test_large_string_array_builder_finish() {
+ _test_generic_string_array_builder_finish::<i64>()
+ }
}
diff --git a/arrow-array/src/builder/generic_string_builder.rs b/arrow-array/src/builder/generic_string_builder.rs
deleted file mode 100644
index f766b6f55..000000000
--- a/arrow-array/src/builder/generic_string_builder.rs
+++ /dev/null
@@ -1,204 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use crate::builder::{ArrayBuilder, GenericBinaryBuilder};
-use crate::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
-use std::any::Any;
-use std::sync::Arc;
-
-/// Array builder for [`GenericStringArray`]
-#[derive(Debug)]
-pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
- builder: GenericBinaryBuilder<OffsetSize>,
-}
-
-impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
- /// Creates a new [`GenericStringBuilder`].
- pub fn new() -> Self {
- Self {
- builder: GenericBinaryBuilder::new(),
- }
- }
-
- /// Creates a new [`GenericStringBuilder`].
- ///
- /// - `item_capacity` is the number of items to pre-allocate.
- /// The size of the preallocated buffer of offsets is the number of items plus one.
- /// - `data_capacity` is the total number of bytes of string data to pre-allocate
- /// (for all items, not per item).
- pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- Self {
- builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
- }
- }
-
- /// Appends a string into the builder.
- #[inline]
- pub fn append_value(&mut self, value: impl AsRef<str>) {
- self.builder.append_value(value.as_ref().as_bytes());
- }
-
- /// Append a null value into the builder.
- #[inline]
- pub fn append_null(&mut self) {
- self.builder.append_null()
- }
-
- /// Append an `Option` value into the builder.
- #[inline]
- pub fn append_option(&mut self, value: Option<impl AsRef<str>>) {
- match value {
- None => self.append_null(),
- Some(v) => self.append_value(v),
- };
- }
-
- /// Builds the [`GenericStringArray`] and reset this builder.
- pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
- let t = GenericStringArray::<OffsetSize>::DATA_TYPE;
- let v = self.builder.finish();
- let builder = v.into_data().into_builder().data_type(t);
-
- // SAFETY:
- // Data must be UTF-8 as only support writing `str`
- // Offsets must be valid as guaranteed by `GenericBinaryBuilder`
- let data = unsafe { builder.build_unchecked() };
- data.into()
- }
-
- /// Returns the current values buffer as a slice.
- pub fn values_slice(&self) -> &[u8] {
- self.builder.values_slice()
- }
-
- /// Returns the current offsets buffer as a slice.
- pub fn offsets_slice(&self) -> &[OffsetSize] {
- self.builder.offsets_slice()
- }
-}
-
-impl<OffsetSize: OffsetSizeTrait> Default for GenericStringBuilder<OffsetSize> {
- fn default() -> Self {
- Self::new()
- }
-}
-
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSize> {
- /// Returns the builder as a non-mutable `Any` reference.
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- /// Returns the builder as a mutable `Any` reference.
- fn as_any_mut(&mut self) -> &mut dyn Any {
- self
- }
-
- /// Returns the boxed builder as a box of `Any`.
- fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
- self
- }
-
- /// Returns the number of array slots in the builder
- fn len(&self) -> usize {
- self.builder.len()
- }
-
- /// Returns whether the number of array slots is zero
- fn is_empty(&self) -> bool {
- self.builder.is_empty()
- }
-
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- let a = GenericStringBuilder::<OffsetSize>::finish(self);
- Arc::new(a)
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::array::{Array, OffsetSizeTrait};
- use crate::builder::ArrayBuilder;
-
- fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
- let mut builder = GenericStringBuilder::<O>::new();
- let owned = "arrow".to_owned();
-
- builder.append_value("hello");
- builder.append_value("");
- builder.append_value(&owned);
- builder.append_null();
- builder.append_option(Some("rust"));
- builder.append_option(None::<&str>);
- builder.append_option(None::<String>);
- assert_eq!(7, builder.len());
-
- assert_eq!(
- GenericStringArray::<O>::from(vec![
- Some("hello"),
- Some(""),
- Some("arrow"),
- None,
- Some("rust"),
- None,
- None
- ]),
- builder.finish()
- );
- }
-
- #[test]
- fn test_string_array_builder() {
- _test_generic_string_array_builder::<i32>()
- }
-
- #[test]
- fn test_large_string_array_builder() {
- _test_generic_string_array_builder::<i64>()
- }
-
- fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
- let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);
-
- builder.append_value("hello");
- builder.append_value("rust");
- builder.append_null();
-
- builder.finish();
- assert!(builder.is_empty());
- assert_eq!(&[O::zero()], builder.offsets_slice());
-
- builder.append_value("arrow");
- builder.append_value("parquet");
- let arr = builder.finish();
- // array should not have null buffer because there is not `null` value.
- assert_eq!(None, arr.data().null_buffer());
- assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
- }
-
- #[test]
- fn test_string_array_builder_finish() {
- _test_generic_string_array_builder_finish::<i32>()
- }
-
- #[test]
- fn test_large_string_array_builder_finish() {
- _test_generic_string_array_builder_finish::<i64>()
- }
-}
diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs
index 5edf011d7..a5c1e3d4b 100644
--- a/arrow-array/src/builder/mod.rs
+++ b/arrow-array/src/builder/mod.rs
@@ -28,12 +28,10 @@ mod fixed_size_binary_builder;
pub use fixed_size_binary_builder::*;
mod fixed_size_list_builder;
pub use fixed_size_list_builder::*;
-mod generic_binary_builder;
-pub use generic_binary_builder::*;
+mod generic_bytes_builder;
+pub use generic_bytes_builder::*;
mod generic_list_builder;
pub use generic_list_builder::*;
-mod generic_string_builder;
-pub use generic_string_builder::*;
mod map_builder;
pub use map_builder::*;
mod null_buffer_builder;