You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/23 18:39:02 UTC
[arrow-rs] branch master updated: Complete and fixup split of `arrow::array::builder` module (#1843) (#1928)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 041e5101f Complete and fixup split of `arrow::array::builder` module (#1843) (#1928)
041e5101f is described below
commit 041e5101fa64d638ee6a7fb3878c52503838c349
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Jun 23 19:38:56 2022 +0100
Complete and fixup split of `arrow::array::builder` module (#1843) (#1928)
* Fix merge conflicts from (#1879)
* Split out of decimal_builder (#1843)
* Fix RAT
* Format
* Restore (#1842)
---
arrow/src/array/builder/buffer_builder.rs | 116 ++++++--
arrow/src/array/builder/decimal_builder.rs | 318 +++------------------
.../src/array/builder/fixed_size_binary_builder.rs | 99 +++++++
arrow/src/array/builder/generic_binary_builder.rs | 111 +++++++
arrow/src/array/builder/generic_string_builder.rs | 123 ++++++++
arrow/src/array/builder/mod.rs | 31 +-
arrow/src/array/builder/union_builder.rs | 142 ++++-----
7 files changed, 512 insertions(+), 428 deletions(-)
diff --git a/arrow/src/array/builder/buffer_builder.rs b/arrow/src/array/builder/buffer_builder.rs
index 83b2afb44..9dd138398 100644
--- a/arrow/src/array/builder/buffer_builder.rs
+++ b/arrow/src/array/builder/buffer_builder.rs
@@ -22,29 +22,6 @@ use crate::datatypes::ArrowNativeType;
use super::PhantomData;
-/// Converts a `MutableBuffer` to a `BufferBuilder<T>`.
-///
-/// `slots` is the number of array slots currently represented in the `MutableBuffer`.
-pub(crate) fn mutable_buffer_to_builder<T: ArrowNativeType>(
- mutable_buffer: MutableBuffer,
- slots: usize,
-) -> BufferBuilder<T> {
- BufferBuilder::<T> {
- buffer: mutable_buffer,
- len: slots,
- _marker: PhantomData,
- }
-}
-
-/// Converts a `BufferBuilder<T>` into its underlying `MutableBuffer`.
-///
-/// `From` is not implemented because associated type bounds are unstable.
-pub(crate) fn builder_to_mutable_buffer<T: ArrowNativeType>(
- builder: BufferBuilder<T>,
-) -> MutableBuffer {
- builder.buffer
-}
-
/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object.
///
/// A [`Buffer`](crate::buffer::Buffer) is the underlying data
@@ -168,8 +145,7 @@ impl<T: ArrowNativeType> BufferBuilder<T> {
/// ```
#[inline]
pub fn advance(&mut self, i: usize) {
- let new_buffer_len = (self.len + i) * mem::size_of::<T>();
- self.buffer.resize(new_buffer_len, 0);
+ self.buffer.extend_zeros(i * mem::size_of::<T>());
self.len += i;
}
@@ -232,6 +208,24 @@ impl<T: ArrowNativeType> BufferBuilder<T> {
self.len += n;
}
+ /// Appends `n` zero-initialized values
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::UInt32BufferBuilder;
+ ///
+ /// let mut builder = UInt32BufferBuilder::new(10);
+ /// builder.append_n_zeroed(3);
+ ///
+ /// assert_eq!(builder.len(), 3);
+ /// assert_eq!(builder.as_slice(), &[0, 0, 0])
+ /// ```
+ #[inline]
+ pub fn append_n_zeroed(&mut self, n: usize) {
+ self.buffer.extend_zeros(n * mem::size_of::<T>());
+ self.len += n;
+ }
+
/// Appends a slice of type `T`, growing the internal buffer as needed.
///
/// # Example:
@@ -250,6 +244,78 @@ impl<T: ArrowNativeType> BufferBuilder<T> {
self.len += slice.len();
}
+ /// View the contents of this buffer as a slice
+ ///
+ /// ```
+ /// use arrow::array::Float64BufferBuilder;
+ ///
+ /// let mut builder = Float64BufferBuilder::new(10);
+ /// builder.append(1.3);
+ /// builder.append_n(2, 2.3);
+ ///
+ /// assert_eq!(builder.as_slice(), &[1.3, 2.3, 2.3]);
+ /// ```
+ #[inline]
+ pub fn as_slice(&self) -> &[T] {
+ // SAFETY
+ //
+ // - MutableBuffer is aligned and initialized for len elements of T
+ // - MutableBuffer corresponds to a single allocation
+ // - MutableBuffer does not support modification whilst active immutable borrows
+ unsafe { std::slice::from_raw_parts(self.buffer.as_ptr() as _, self.len) }
+ }
+
+ /// View the contents of this buffer as a mutable slice
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::Float32BufferBuilder;
+ ///
+ /// let mut builder = Float32BufferBuilder::new(10);
+ ///
+ /// builder.append_slice(&[1., 2., 3.4]);
+ /// assert_eq!(builder.as_slice(), &[1., 2., 3.4]);
+ ///
+ /// builder.as_slice_mut()[1] = 4.2;
+ /// assert_eq!(builder.as_slice(), &[1., 4.2, 3.4]);
+ /// ```
+ #[inline]
+ pub fn as_slice_mut(&mut self) -> &mut [T] {
+ // SAFETY
+ //
+ // - MutableBuffer is aligned and initialized for len elements of T
+ // - MutableBuffer corresponds to a single allocation
+ // - MutableBuffer does not support modification whilst active immutable borrows
+ unsafe { std::slice::from_raw_parts_mut(self.buffer.as_mut_ptr() as _, self.len) }
+ }
+
+ /// Shorten this BufferBuilder to `len` items
+ ///
+ /// If `len` is greater than the builder's current length, this has no effect
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::UInt16BufferBuilder;
+ ///
+ /// let mut builder = UInt16BufferBuilder::new(10);
+ ///
+ /// builder.append_slice(&[42, 44, 46]);
+ /// assert_eq!(builder.as_slice(), &[42, 44, 46]);
+ ///
+ /// builder.truncate(2);
+ /// assert_eq!(builder.as_slice(), &[42, 44]);
+ ///
+ /// builder.append(12);
+ /// assert_eq!(builder.as_slice(), &[42, 44, 12]);
+ /// ```
+ #[inline]
+ pub fn truncate(&mut self, len: usize) {
+ self.buffer.truncate(len * mem::size_of::<T>());
+ self.len = len;
+ }
+
/// # Safety
/// This requires the iterator be a trusted length. This could instead require
/// the iterator implement `TrustedLen` once that is stabilized.
diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs
index a7925358b..e7e9ec6a5 100644
--- a/arrow/src/array/builder/decimal_builder.rs
+++ b/arrow/src/array/builder/decimal_builder.rs
@@ -18,19 +18,13 @@
use std::any::Any;
use std::sync::Arc;
-use crate::array::ArrayBuilder;
use crate::array::ArrayRef;
use crate::array::DecimalArray;
-use crate::array::FixedSizeBinaryArray;
-use crate::array::OffsetSizeTrait;
use crate::array::UInt8Builder;
-use crate::array::{GenericBinaryArray, GenericStringArray};
+use crate::array::{ArrayBuilder, FixedSizeListBuilder};
use crate::error::{ArrowError, Result};
-use super::{FixedSizeBinaryBuilder, FixedSizeListBuilder};
-use super::{GenericBinaryBuilder, GenericListBuilder, GenericStringBuilder};
-
use crate::datatypes::validate_decimal_precision;
/// Array Builder for [`DecimalArray`]
@@ -48,284 +42,6 @@ pub struct DecimalBuilder {
value_validation: bool,
}
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
- /// Returns the builder as a non-mutable `Any` reference.
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- /// Returns the builder as a mutable `Any` reference.
- fn as_any_mut(&mut self) -> &mut dyn Any {
- self
- }
-
- /// Returns the boxed builder as a box of `Any`.
- fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
- self
- }
-
- /// Returns the number of array slots in the builder
- fn len(&self) -> usize {
- self.builder.len()
- }
-
- /// Returns whether the number of array slots is zero
- fn is_empty(&self) -> bool {
- self.builder.is_empty()
- }
-
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- Arc::new(self.finish())
- }
-}
-
-impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSize> {
- /// Returns the builder as a non-mutable `Any` reference.
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- /// Returns the builder as a mutable `Any` reference.
- fn as_any_mut(&mut self) -> &mut dyn Any {
- self
- }
-
- /// Returns the boxed builder as a box of `Any`.
- fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
- self
- }
-
- /// Returns the number of array slots in the builder
- fn len(&self) -> usize {
- self.builder.len()
- }
-
- /// Returns whether the number of array slots is zero
- fn is_empty(&self) -> bool {
- self.builder.is_empty()
- }
-
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- let a = GenericStringBuilder::<OffsetSize>::finish(self);
- Arc::new(a)
- }
-}
-
-impl ArrayBuilder for FixedSizeBinaryBuilder {
- /// Returns the builder as a non-mutable `Any` reference.
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- /// Returns the builder as a mutable `Any` reference.
- fn as_any_mut(&mut self) -> &mut dyn Any {
- self
- }
-
- /// Returns the boxed builder as a box of `Any`.
- fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
- self
- }
-
- /// Returns the number of array slots in the builder
- fn len(&self) -> usize {
- self.builder.len()
- }
-
- /// Returns whether the number of array slots is zero
- fn is_empty(&self) -> bool {
- self.builder.is_empty()
- }
-
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- Arc::new(self.finish())
- }
-}
-
-impl ArrayBuilder for DecimalBuilder {
- /// Returns the builder as a non-mutable `Any` reference.
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- /// Returns the builder as a mutable `Any` reference.
- fn as_any_mut(&mut self) -> &mut dyn Any {
- self
- }
-
- /// Returns the boxed builder as a box of `Any`.
- fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
- self
- }
-
- /// Returns the number of array slots in the builder
- fn len(&self) -> usize {
- self.builder.len()
- }
-
- /// Returns whether the number of array slots is zero
- fn is_empty(&self) -> bool {
- self.builder.is_empty()
- }
-
- /// Builds the array and reset this builder.
- fn finish(&mut self) -> ArrayRef {
- Arc::new(self.finish())
- }
-}
-
-impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
- /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values
- /// array
- pub fn new(capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(capacity);
- Self {
- builder: GenericListBuilder::new(values_builder),
- }
- }
-
- /// Appends a single byte value into the builder's values array.
- ///
- /// Note, when appending individual byte values you must call `append` to delimit each
- /// distinct list value.
- #[inline]
- pub fn append_byte(&mut self, value: u8) -> Result<()> {
- self.builder.values().append_value(value)?;
- Ok(())
- }
-
- /// Appends a byte slice into the builder.
- ///
- /// Automatically calls the `append` method to delimit the slice appended in as a
- /// distinct array element.
- #[inline]
- pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> {
- self.builder.values().append_slice(value.as_ref())?;
- self.builder.append(true)?;
- Ok(())
- }
-
- /// Finish the current variable-length list array slot.
- #[inline]
- pub fn append(&mut self, is_valid: bool) -> Result<()> {
- self.builder.append(is_valid)
- }
-
- /// Append a null value to the array.
- #[inline]
- pub fn append_null(&mut self) -> Result<()> {
- self.append(false)
- }
-
- /// Builds the `BinaryArray` and reset this builder.
- pub fn finish(&mut self) -> GenericBinaryArray<OffsetSize> {
- GenericBinaryArray::<OffsetSize>::from(self.builder.finish())
- }
-}
-
-impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
- /// Creates a new `StringBuilder`,
- /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
- pub fn new(capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(capacity);
- Self {
- builder: GenericListBuilder::new(values_builder),
- }
- }
-
- /// Creates a new `StringBuilder`,
- /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
- /// `item_capacity` is the number of items to pre-allocate space for in this builder
- pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
- let values_builder = UInt8Builder::new(data_capacity);
- Self {
- builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
- }
- }
-
- /// Appends a string into the builder.
- ///
- /// Automatically calls the `append` method to delimit the string appended in as a
- /// distinct array element.
- #[inline]
- pub fn append_value(&mut self, value: impl AsRef<str>) -> Result<()> {
- self.builder
- .values()
- .append_slice(value.as_ref().as_bytes())?;
- self.builder.append(true)?;
- Ok(())
- }
-
- /// Finish the current variable-length list array slot.
- #[inline]
- pub fn append(&mut self, is_valid: bool) -> Result<()> {
- self.builder.append(is_valid)
- }
-
- /// Append a null value to the array.
- #[inline]
- pub fn append_null(&mut self) -> Result<()> {
- self.append(false)
- }
-
- /// Append an `Option` value to the array.
- #[inline]
- pub fn append_option(&mut self, value: Option<impl AsRef<str>>) -> Result<()> {
- match value {
- None => self.append_null()?,
- Some(v) => self.append_value(v)?,
- };
- Ok(())
- }
-
- /// Builds the `StringArray` and reset this builder.
- pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
- GenericStringArray::<OffsetSize>::from(self.builder.finish())
- }
-}
-
-impl FixedSizeBinaryBuilder {
- /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values
- /// array
- pub fn new(capacity: usize, byte_width: i32) -> Self {
- let values_builder = UInt8Builder::new(capacity);
- Self {
- builder: FixedSizeListBuilder::new(values_builder, byte_width),
- }
- }
-
- /// Appends a byte slice into the builder.
- ///
- /// Automatically calls the `append` method to delimit the slice appended in as a
- /// distinct array element.
- #[inline]
- pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> {
- if self.builder.value_length() != value.as_ref().len() as i32 {
- return Err(ArrowError::InvalidArgumentError(
- "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string()
- ));
- }
- self.builder.values().append_slice(value.as_ref())?;
- self.builder.append(true)
- }
-
- /// Append a null value to the array.
- #[inline]
- pub fn append_null(&mut self) -> Result<()> {
- let length: usize = self.builder.value_length() as usize;
- self.builder.values().append_slice(&vec![0u8; length][..])?;
- self.builder.append(false)
- }
-
- /// Builds the `FixedSizeBinaryArray` and reset this builder.
- pub fn finish(&mut self) -> FixedSizeBinaryArray {
- FixedSizeBinaryArray::from(self.builder.finish())
- }
-}
-
impl DecimalBuilder {
/// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values
/// array
@@ -406,6 +122,38 @@ impl DecimalBuilder {
}
}
+impl ArrayBuilder for DecimalBuilder {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut dyn Any {
+ self
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Returns whether the number of array slots is zero
+ fn is_empty(&self) -> bool {
+ self.builder.is_empty()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow/src/array/builder/fixed_size_binary_builder.rs
new file mode 100644
index 000000000..1d40b4c5b
--- /dev/null
+++ b/arrow/src/array/builder/fixed_size_binary_builder.rs
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::{
+ ArrayBuilder, ArrayRef, FixedSizeBinaryArray, FixedSizeListBuilder, UInt8Builder,
+};
+use crate::error::{ArrowError, Result};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub struct FixedSizeBinaryBuilder {
+ builder: FixedSizeListBuilder<UInt8Builder>,
+}
+
+impl FixedSizeBinaryBuilder {
+ /// Creates a new `FixedSizeBinaryBuilder`, `capacity` is the number of bytes in the values
+ /// array
+ pub fn new(capacity: usize, byte_width: i32) -> Self {
+ let values_builder = UInt8Builder::new(capacity);
+ Self {
+ builder: FixedSizeListBuilder::new(values_builder, byte_width),
+ }
+ }
+
+ /// Appends a byte slice into the builder.
+ ///
+ /// Automatically calls the `append` method to delimit the slice appended in as a
+ /// distinct array element.
+ #[inline]
+ pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> {
+ if self.builder.value_length() != value.as_ref().len() as i32 {
+ return Err(ArrowError::InvalidArgumentError(
+ "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string()
+ ));
+ }
+ self.builder.values().append_slice(value.as_ref())?;
+ self.builder.append(true)
+ }
+
+ /// Append a null value to the array.
+ #[inline]
+ pub fn append_null(&mut self) -> Result<()> {
+ let length: usize = self.builder.value_length() as usize;
+ self.builder.values().append_slice(&vec![0u8; length][..])?;
+ self.builder.append(false)
+ }
+
+ /// Builds the `FixedSizeBinaryArray` and reset this builder.
+ pub fn finish(&mut self) -> FixedSizeBinaryArray {
+ FixedSizeBinaryArray::from(self.builder.finish())
+ }
+}
+
+impl ArrayBuilder for FixedSizeBinaryBuilder {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut dyn Any {
+ self
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Returns whether the number of array slots is zero
+ fn is_empty(&self) -> bool {
+ self.builder.is_empty()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+}
diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs
new file mode 100644
index 000000000..fc64eb0a2
--- /dev/null
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::{
+ ArrayBuilder, ArrayRef, GenericBinaryArray, GenericListBuilder, OffsetSizeTrait,
+ UInt8Builder,
+};
+use crate::error::Result;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Array builder for `BinaryArray`
+#[derive(Debug)]
+pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
+ builder: GenericListBuilder<OffsetSize, UInt8Builder>,
+}
+
+impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
+ /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values
+ /// array
+ pub fn new(capacity: usize) -> Self {
+ let values_builder = UInt8Builder::new(capacity);
+ Self {
+ builder: GenericListBuilder::new(values_builder),
+ }
+ }
+
+ /// Appends a single byte value into the builder's values array.
+ ///
+ /// Note, when appending individual byte values you must call `append` to delimit each
+ /// distinct list value.
+ #[inline]
+ pub fn append_byte(&mut self, value: u8) -> Result<()> {
+ self.builder.values().append_value(value)?;
+ Ok(())
+ }
+
+ /// Appends a byte slice into the builder.
+ ///
+ /// Automatically calls the `append` method to delimit the slice appended in as a
+ /// distinct array element.
+ #[inline]
+ pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> {
+ self.builder.values().append_slice(value.as_ref())?;
+ self.builder.append(true)?;
+ Ok(())
+ }
+
+ /// Finish the current variable-length list array slot.
+ #[inline]
+ pub fn append(&mut self, is_valid: bool) -> Result<()> {
+ self.builder.append(is_valid)
+ }
+
+ /// Append a null value to the array.
+ #[inline]
+ pub fn append_null(&mut self) -> Result<()> {
+ self.append(false)
+ }
+
+ /// Builds the `BinaryArray` and reset this builder.
+ pub fn finish(&mut self) -> GenericBinaryArray<OffsetSize> {
+ GenericBinaryArray::<OffsetSize>::from(self.builder.finish())
+ }
+}
+
+impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut dyn Any {
+ self
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Returns whether the number of array slots is zero
+ fn is_empty(&self) -> bool {
+ self.builder.is_empty()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ Arc::new(self.finish())
+ }
+}
diff --git a/arrow/src/array/builder/generic_string_builder.rs b/arrow/src/array/builder/generic_string_builder.rs
new file mode 100644
index 000000000..ee391c4d4
--- /dev/null
+++ b/arrow/src/array/builder/generic_string_builder.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::{
+ ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait,
+ UInt8Builder,
+};
+use crate::error::Result;
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
+ builder: GenericListBuilder<OffsetSize, UInt8Builder>,
+}
+
+impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
+ /// Creates a new `StringBuilder`,
+ /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
+ pub fn new(capacity: usize) -> Self {
+ let values_builder = UInt8Builder::new(capacity);
+ Self {
+ builder: GenericListBuilder::new(values_builder),
+ }
+ }
+
+ /// Creates a new `StringBuilder`,
+ /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
+ /// `item_capacity` is the number of items to pre-allocate space for in this builder
+ pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let values_builder = UInt8Builder::new(data_capacity);
+ Self {
+ builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
+ }
+ }
+
+ /// Appends a string into the builder.
+ ///
+ /// Automatically calls the `append` method to delimit the string appended in as a
+ /// distinct array element.
+ #[inline]
+ pub fn append_value(&mut self, value: impl AsRef<str>) -> Result<()> {
+ self.builder
+ .values()
+ .append_slice(value.as_ref().as_bytes())?;
+ self.builder.append(true)?;
+ Ok(())
+ }
+
+ /// Finish the current variable-length list array slot.
+ #[inline]
+ pub fn append(&mut self, is_valid: bool) -> Result<()> {
+ self.builder.append(is_valid)
+ }
+
+ /// Append a null value to the array.
+ #[inline]
+ pub fn append_null(&mut self) -> Result<()> {
+ self.append(false)
+ }
+
+ /// Append an `Option` value to the array.
+ #[inline]
+ pub fn append_option(&mut self, value: Option<impl AsRef<str>>) -> Result<()> {
+ match value {
+ None => self.append_null()?,
+ Some(v) => self.append_value(v)?,
+ };
+ Ok(())
+ }
+
+ /// Builds the `StringArray` and reset this builder.
+ pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
+ GenericStringArray::<OffsetSize>::from(self.builder.finish())
+ }
+}
+
+impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSize> {
+ /// Returns the builder as a non-mutable `Any` reference.
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ /// Returns the builder as a mutable `Any` reference.
+ fn as_any_mut(&mut self) -> &mut dyn Any {
+ self
+ }
+
+ /// Returns the boxed builder as a box of `Any`.
+ fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
+ self
+ }
+
+ /// Returns the number of array slots in the builder
+ fn len(&self) -> usize {
+ self.builder.len()
+ }
+
+ /// Returns whether the number of array slots is zero
+ fn is_empty(&self) -> bool {
+ self.builder.is_empty()
+ }
+
+ /// Builds the array and reset this builder.
+ fn finish(&mut self) -> ArrayRef {
+ let a = GenericStringBuilder::<OffsetSize>::finish(self);
+ Arc::new(a)
+ }
+}
diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs
index 4cd82d9bf..634ef772f 100644
--- a/arrow/src/array/builder/mod.rs
+++ b/arrow/src/array/builder/mod.rs
@@ -24,8 +24,11 @@ mod boolean_buffer_builder;
mod boolean_builder;
mod buffer_builder;
mod decimal_builder;
+mod fixed_size_binary_builder;
mod fixed_size_list_builder;
+mod generic_binary_builder;
mod generic_list_builder;
+mod generic_string_builder;
mod map_builder;
mod primitive_builder;
mod primitive_dictionary_builder;
@@ -38,24 +41,23 @@ use std::marker::PhantomData;
use std::ops::Range;
use super::ArrayRef;
-use super::OffsetSizeTrait;
-use super::UInt8Builder;
pub use boolean_buffer_builder::BooleanBufferBuilder;
pub use boolean_builder::BooleanBuilder;
pub use buffer_builder::BufferBuilder;
pub use decimal_builder::DecimalBuilder;
+pub use fixed_size_binary_builder::FixedSizeBinaryBuilder;
pub use fixed_size_list_builder::FixedSizeListBuilder;
+pub use generic_binary_builder::GenericBinaryBuilder;
pub use generic_list_builder::GenericListBuilder;
+pub use generic_string_builder::GenericStringBuilder;
pub use map_builder::MapBuilder;
pub use primitive_builder::PrimitiveBuilder;
pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder;
pub use string_dictionary_builder::StringDictionaryBuilder;
-pub use struct_builder::StructBuilder;
+pub use struct_builder::{make_builder, StructBuilder};
pub use union_builder::UnionBuilder;
-pub use struct_builder::make_builder;
-
/// Trait for dealing with different array builders at runtime
///
/// # Example
@@ -139,27 +141,8 @@ pub trait ArrayBuilder: Any + Send {
pub type ListBuilder<T> = GenericListBuilder<i32, T>;
pub type LargeListBuilder<T> = GenericListBuilder<i64, T>;
-/// Array builder for `BinaryArray`
-#[derive(Debug)]
-pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
- builder: GenericListBuilder<OffsetSize, UInt8Builder>,
-}
-
pub type BinaryBuilder = GenericBinaryBuilder<i32>;
pub type LargeBinaryBuilder = GenericBinaryBuilder<i64>;
-#[derive(Debug)]
-pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
- builder: GenericListBuilder<OffsetSize, UInt8Builder>,
-}
-
pub type StringBuilder = GenericStringBuilder<i32>;
pub type LargeStringBuilder = GenericStringBuilder<i64>;
-
-#[derive(Debug)]
-pub struct FixedSizeBinaryBuilder {
- builder: FixedSizeListBuilder<UInt8Builder>,
-}
-
-#[cfg(test)]
-mod tests {}
diff --git a/arrow/src/array/builder/union_builder.rs b/arrow/src/array/builder/union_builder.rs
index 78f9a3f4b..95d9ea40a 100644
--- a/arrow/src/array/builder/union_builder.rs
+++ b/arrow/src/array/builder/union_builder.rs
@@ -15,28 +15,22 @@
// specific language governing permissions and limitations
// under the License.
+use std::any::Any;
use std::collections::HashMap;
use crate::array::ArrayDataBuilder;
use crate::array::Int32BufferBuilder;
use crate::array::Int8BufferBuilder;
use crate::array::UnionArray;
-use crate::buffer::MutableBuffer;
+use crate::buffer::Buffer;
-use crate::datatypes::ArrowPrimitiveType;
use crate::datatypes::DataType;
use crate::datatypes::Field;
-use crate::datatypes::IntervalMonthDayNanoType;
-use crate::datatypes::IntervalUnit;
-use crate::datatypes::{Float32Type, Float64Type};
-use crate::datatypes::{Int16Type, Int32Type, Int64Type, Int8Type};
-use crate::datatypes::{UInt16Type, UInt32Type, UInt64Type, UInt8Type};
+use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType};
use crate::error::{ArrowError, Result};
use super::{BooleanBufferBuilder, BufferBuilder};
-use super::buffer_builder::builder_to_mutable_buffer;
-use super::buffer_builder::mutable_buffer_to_builder;
use crate::array::make_array;
/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`.
@@ -47,101 +41,65 @@ struct FieldData {
/// The Arrow data type represented in the `values_buffer`, which is untyped
data_type: DataType,
/// A buffer containing the values for this field in raw bytes
- values_buffer: Option<MutableBuffer>,
+ values_buffer: Box<dyn FieldDataValues>,
/// The number of array slots represented by the buffer
slots: usize,
/// A builder for the null bitmap
bitmap_builder: BooleanBufferBuilder,
}
+/// A type-erased [`BufferBuilder`] used by [`FieldData`]
+trait FieldDataValues: std::fmt::Debug {
+ fn as_mut_any(&mut self) -> &mut dyn Any;
+
+ fn append_null(&mut self);
+
+ fn finish(&mut self) -> Buffer;
+}
+
+impl<T: ArrowNativeType> FieldDataValues for BufferBuilder<T> {
+ fn as_mut_any(&mut self) -> &mut dyn Any {
+ self
+ }
+
+ fn append_null(&mut self) {
+ self.advance(1)
+ }
+
+ fn finish(&mut self) -> Buffer {
+ self.finish()
+ }
+}
+
impl FieldData {
/// Creates a new `FieldData`.
- fn new(type_id: i8, data_type: DataType) -> Self {
+ fn new<T: ArrowPrimitiveType>(type_id: i8, data_type: DataType) -> Self {
Self {
type_id,
data_type,
- values_buffer: Some(MutableBuffer::new(1)),
slots: 0,
+ values_buffer: Box::new(BufferBuilder::<T::Native>::new(1)),
bitmap_builder: BooleanBufferBuilder::new(1),
}
}
/// Appends a single value to this `FieldData`'s `values_buffer`.
- #[allow(clippy::unnecessary_wraps)]
- fn append_to_values_buffer<T: ArrowPrimitiveType>(
- &mut self,
- v: T::Native,
- ) -> Result<()> {
- let values_buffer = self
- .values_buffer
- .take()
- .expect("Values buffer was never created");
- let mut builder: BufferBuilder<T::Native> =
- mutable_buffer_to_builder(values_buffer, self.slots);
- builder.append(v);
- let mutable_buffer = builder_to_mutable_buffer(builder);
- self.values_buffer = Some(mutable_buffer);
+ fn append_value<T: ArrowPrimitiveType>(&mut self, v: T::Native) {
+ self.values_buffer
+ .as_mut_any()
+ .downcast_mut::<BufferBuilder<T::Native>>()
+ .expect("Tried to append unexpected type")
+ .append(v);
- self.slots += 1;
self.bitmap_builder.append(true);
- Ok(())
+ self.slots += 1;
}
/// Appends a null to this `FieldData`.
- #[allow(clippy::unnecessary_wraps)]
- fn append_null<T: ArrowPrimitiveType>(&mut self) -> Result<()> {
- let values_buffer = self
- .values_buffer
- .take()
- .expect("Values buffer was never created");
-
- let mut builder: BufferBuilder<T::Native> =
- mutable_buffer_to_builder(values_buffer, self.slots);
-
- builder.advance(1);
- let mutable_buffer = builder_to_mutable_buffer(builder);
- self.values_buffer = Some(mutable_buffer);
- self.slots += 1;
+ fn append_null(&mut self) {
+ self.values_buffer.append_null();
self.bitmap_builder.append(false);
- Ok(())
- }
-
- /// Appends a null to this `FieldData` when the type is not known at compile time.
- ///
- /// As the main `append` method of `UnionBuilder` is generic, we need a way to append null
- /// slots to the fields that are not being appended to in the case of sparse unions. This
- /// method solves this problem by appending dynamically based on `DataType`.
- ///
- /// Note, this method does **not** update the length of the `UnionArray` (this is done by the
- /// main append operation) and assumes that it is called from a method that is generic over `T`
- /// where `T` satisfies the bound `ArrowPrimitiveType`.
- fn append_null_dynamic(&mut self) -> Result<()> {
- match self.data_type {
- DataType::Null => unimplemented!(),
- DataType::Int8 => self.append_null::<Int8Type>()?,
- DataType::Int16 => self.append_null::<Int16Type>()?,
- DataType::Int32
- | DataType::Date32
- | DataType::Time32(_)
- | DataType::Interval(IntervalUnit::YearMonth) => {
- self.append_null::<Int32Type>()?
- }
- DataType::Int64
- | DataType::Timestamp(_, _)
- | DataType::Date64
- | DataType::Time64(_)
- | DataType::Interval(IntervalUnit::DayTime)
- | DataType::Duration(_) => self.append_null::<Int64Type>()?,
- DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::<IntervalMonthDayNanoType>()?,
- DataType::UInt8 => self.append_null::<UInt8Type>()?,
- DataType::UInt16 => self.append_null::<UInt16Type>()?,
- DataType::UInt32 => self.append_null::<UInt32Type>()?,
- DataType::UInt64 => self.append_null::<UInt64Type>()?,
- DataType::Float32 => self.append_null::<Float32Type>()?,
- DataType::Float64 => self.append_null::<Float64Type>()?,
- _ => unreachable!("All cases of types that satisfy the trait bounds over T are covered above."),
- };
- Ok(())
+ self.slots += 1;
}
}
@@ -257,11 +215,12 @@ impl UnionBuilder {
data
}
None => match self.value_offset_builder {
- Some(_) => FieldData::new(self.fields.len() as i8, T::DATA_TYPE),
+ Some(_) => FieldData::new::<T>(self.fields.len() as i8, T::DATA_TYPE),
None => {
- let mut fd = FieldData::new(self.fields.len() as i8, T::DATA_TYPE);
+ let mut fd =
+ FieldData::new::<T>(self.fields.len() as i8, T::DATA_TYPE);
for _ in 0..self.len {
- fd.append_null::<T>()?;
+ fd.append_null();
}
fd
}
@@ -278,14 +237,14 @@ impl UnionBuilder {
None => {
for (_, fd) in self.fields.iter_mut() {
// Append to all bar the FieldData currently being appended to
- fd.append_null_dynamic()?;
+ fd.append_null();
}
}
}
match v {
- Some(v) => field_data.append_to_values_buffer::<T>(v)?,
- None => field_data.append_null::<T>()?,
+ Some(v) => field_data.append_value::<T>(v),
+ None => field_data.append_null(),
}
self.fields.insert(type_name, field_data);
@@ -303,15 +262,13 @@ impl UnionBuilder {
FieldData {
type_id,
data_type,
- values_buffer,
+ mut values_buffer,
slots,
mut bitmap_builder,
},
) in self.fields.into_iter()
{
- let buffer = values_buffer
- .expect("The `values_buffer` should only ever be None inside the `append` method.")
- .into();
+ let buffer = values_buffer.finish();
let arr_data_builder = ArrayDataBuilder::new(data_type.clone())
.add_buffer(buffer)
.len(slots)
@@ -333,6 +290,3 @@ impl UnionBuilder {
UnionArray::try_new(&type_ids, type_id_buffer, value_offsets_buffer, children)
}
}
-
-#[cfg(test)]
-mod tests {}