You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/19 13:05:12 UTC
[arrow-rs] branch master updated: Add ByteArray constructors (#3879) (#4081)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 1097203b1 Add ByteArray constructors (#3879) (#4081)
1097203b1 is described below
commit 1097203b1650fc2cfbb822d04b47610d84100481
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Apr 19 09:05:05 2023 -0400
Add ByteArray constructors (#3879) (#4081)
* Add ByteArray constructors (#3879)
* Clippy
* Make ListArray error message consistent
* Review feedback
---
arrow-array/src/array/byte_array.rs | 142 +++++++++++++++++++++++++++++++++-
arrow-array/src/array/list_array.rs | 4 +-
arrow-array/src/array/string_array.rs | 28 +------
arrow-array/src/types.rs | 52 ++++++++++++-
4 files changed, 197 insertions(+), 29 deletions(-)
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index e23079ef9..12f9aab67 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -21,10 +21,10 @@ use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
-use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::DataType;
+use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
@@ -60,6 +60,87 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Data type of the array.
pub const DATA_TYPE: DataType = T::DATA_TYPE;
+ /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
+ ///
+ /// # Panics
+ ///
+ /// Panics if [`GenericByteArray::try_new`] returns an error
+ pub fn new(
+ offsets: OffsetBuffer<T::Offset>,
+ values: Buffer,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ Self::try_new(offsets, values, nulls).unwrap()
+ }
+
+ /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
+ ///
+ /// # Errors
+ ///
+ /// * `offsets.len() - 1 != nulls.len()`
+ /// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
+ pub fn try_new(
+ offsets: OffsetBuffer<T::Offset>,
+ values: Buffer,
+ nulls: Option<NullBuffer>,
+ ) -> Result<Self, ArrowError> {
+ let len = offsets.len() - 1;
+
+ // Verify that each pair of offsets is a valid slices of values
+ T::validate(&offsets, &values)?;
+
+ if let Some(n) = nulls.as_ref() {
+ if n.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
+ T::Offset::PREFIX,
+ T::PREFIX,
+ n.len(),
+ )));
+ }
+ }
+
+ Ok(Self {
+ data_type: T::DATA_TYPE,
+ value_offsets: offsets,
+ value_data: values,
+ nulls,
+ })
+ }
+
+ /// Create a new [`GenericByteArray`] from the provided parts, without validation
+ ///
+ /// # Safety
+ ///
+ /// Safe if [`Self::try_new`] would not error
+ pub fn new_unchecked(
+ offsets: OffsetBuffer<T::Offset>,
+ values: Buffer,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ Self {
+ data_type: T::DATA_TYPE,
+ value_offsets: offsets,
+ value_data: values,
+ nulls,
+ }
+ }
+
+ /// Create a new [`GenericByteArray`] of length `len` where all values are null
+ pub fn new_null(len: usize) -> Self {
+ Self {
+ data_type: T::DATA_TYPE,
+ value_offsets: OffsetBuffer::new_zeroed(len),
+ value_data: MutableBuffer::new(0).into(),
+ nulls: Some(NullBuffer::new_null(len)),
+ }
+ }
+
+ /// Deconstruct this array into its constituent parts
+ pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
+ (self.value_offsets, self.value_data, self.nulls)
+ }
+
/// Returns the length for value at index `i`.
/// # Panics
/// Panics if index `i` is out of bounds.
@@ -374,3 +455,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
ArrayIter::new(self)
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::{BinaryArray, StringArray};
+ use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
+
+ #[test]
+ fn try_new() {
+ let data = Buffer::from_slice_ref("helloworld");
+ let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
+ StringArray::new(offsets.clone(), data.clone(), None);
+
+ let nulls = NullBuffer::new_null(3);
+ let err =
+ StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone()))
+ .unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");
+
+ let err =
+ BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");
+
+ let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
+ let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None)
+ .unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");
+
+ BinaryArray::new(offsets, non_utf8_data, None);
+
+ let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
+ let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: Offset of 11 exceeds length of values 10"
+ );
+
+ let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
+ );
+
+ let non_ascii_data = Buffer::from_slice_ref("heìloworld");
+ StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
+ BinaryArray::new(offsets, non_ascii_data.clone(), None);
+
+ let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
+ let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None)
+ .unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: Split UTF-8 codepoint at offset 3"
+ );
+
+ BinaryArray::new(offsets, non_ascii_data, None);
+ }
+}
diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs
index 7f6f54e4c..f4e5b4b79 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -109,7 +109,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
- "Incorrect number of nulls for {}ListArray, expected {len} got {}",
+ "Incorrect length of null buffer for {}ListArray, expected {len} got {}",
OffsetSize::PREFIX,
n.len(),
)));
@@ -1137,7 +1137,7 @@ mod tests {
assert_eq!(
err.to_string(),
- "Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3"
+ "Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3"
);
let field = Arc::new(Field::new("element", DataType::Int64, false));
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index e042f29c2..7c4a37529 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -16,9 +16,7 @@
// under the License.
use crate::types::GenericStringType;
-use crate::{
- Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait,
-};
+use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_buffer::{bit_util, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
@@ -105,27 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
pub fn try_from_binary(
v: GenericBinaryArray<OffsetSize>,
) -> Result<Self, ArrowError> {
- let offsets = v.value_offsets();
- let values = v.value_data();
-
- // We only need to validate that all values are valid UTF-8
- let validated = std::str::from_utf8(values).map_err(|e| {
- ArrowError::CastError(format!("Encountered non UTF-8 data: {e}"))
- })?;
-
- for offset in offsets.iter() {
- let o = offset.as_usize();
- if !validated.is_char_boundary(o) {
- return Err(ArrowError::CastError(format!(
- "Split UTF-8 codepoint at offset {o}"
- )));
- }
- }
-
- let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
- // SAFETY:
- // Validated UTF-8 above
- Ok(Self::from(unsafe { builder.build_unchecked() }))
+ let (offsets, values, nulls) = v.into_parts();
+ Self::try_new(offsets, values, nulls)
}
}
@@ -261,6 +240,7 @@ mod tests {
use super::*;
use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
use crate::types::UInt8Type;
+ use crate::Array;
use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
index cec78db9f..b50018ca9 100644
--- a/arrow-array/src/types.rs
+++ b/arrow-array/src/types.rs
@@ -19,7 +19,7 @@
use crate::delta::shift_months;
use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
-use arrow_buffer::i256;
+use arrow_buffer::{i256, Buffer, OffsetBuffer};
use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
use arrow_schema::{
ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
@@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed {
/// Utf8Array will have native type has &str
/// BinaryArray will have type as [u8]
type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;
+
/// "Binary" or "String", for use in error messages
const PREFIX: &'static str;
+
/// Datatype of array elements
const DATA_TYPE: DataType;
+
+ /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values`
+ fn validate(
+ offsets: &OffsetBuffer<Self::Offset>,
+ values: &Buffer,
+ ) -> Result<(), ArrowError>;
}
/// [`ByteArrayType`] for string arrays
@@ -1547,6 +1555,33 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
} else {
DataType::Utf8
};
+
+ fn validate(
+ offsets: &OffsetBuffer<Self::Offset>,
+ values: &Buffer,
+ ) -> Result<(), ArrowError> {
+ // Verify that the slice as a whole is valid UTF-8
+ let validated = std::str::from_utf8(values).map_err(|e| {
+ ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
+ })?;
+
+ // Verify each offset is at a valid character boundary in this UTF-8 array
+ for offset in offsets.iter() {
+ let o = offset.as_usize();
+ if !validated.is_char_boundary(o) {
+ if o < validated.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Split UTF-8 codepoint at offset {o}"
+ )));
+ }
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Offset of {o} exceeds length of values {}",
+ validated.len()
+ )));
+ }
+ }
+ Ok(())
+ }
}
/// An arrow utf8 array with i32 offsets
@@ -1569,6 +1604,21 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericBinaryType<O> {
} else {
DataType::Binary
};
+
+ fn validate(
+ offsets: &OffsetBuffer<Self::Offset>,
+ values: &Buffer,
+ ) -> Result<(), ArrowError> {
+ // offsets are guaranteed to be monotonically increasing and non-empty
+ let max_offset = offsets.last().unwrap().as_usize();
+ if values.len() < max_offset {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Maximum offset of {max_offset} is larger than values of length {}",
+ values.len()
+ )));
+ }
+ Ok(())
+ }
}
/// An arrow binary array with i32 offsets