You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/19 13:05:12 UTC

[arrow-rs] branch master updated: Add ByteArray constructors (#3879) (#4081)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 1097203b1 Add ByteArray constructors (#3879)  (#4081)
1097203b1 is described below

commit 1097203b1650fc2cfbb822d04b47610d84100481
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Apr 19 09:05:05 2023 -0400

    Add ByteArray constructors (#3879)  (#4081)
    
    * Add ByteArray constructors (#3879)
    
    * Clippy
    
    * Make ListArray error message consistent
    
    * Review feedback
---
 arrow-array/src/array/byte_array.rs   | 142 +++++++++++++++++++++++++++++++++-
 arrow-array/src/array/list_array.rs   |   4 +-
 arrow-array/src/array/string_array.rs |  28 +------
 arrow-array/src/types.rs              |  52 ++++++++++++-
 4 files changed, 197 insertions(+), 29 deletions(-)

diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index e23079ef9..12f9aab67 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -21,10 +21,10 @@ use crate::iterator::ArrayIter;
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::ByteArrayType;
 use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
-use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
 use arrow_buffer::{NullBuffer, OffsetBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::DataType;
+use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
 use std::sync::Arc;
 
@@ -60,6 +60,87 @@ impl<T: ByteArrayType> GenericByteArray<T> {
     /// Data type of the array.
     pub const DATA_TYPE: DataType = T::DATA_TYPE;
 
+    /// Create a new [`GenericByteArray`] from the provided parts, panicking on failure
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`GenericByteArray::try_new`] returns an error
+    pub fn new(
+        offsets: OffsetBuffer<T::Offset>,
+        values: Buffer,
+        nulls: Option<NullBuffer>,
+    ) -> Self {
+        Self::try_new(offsets, values, nulls).unwrap()
+    }
+
+    /// Create a new [`GenericByteArray`] from the provided parts, returning an error on failure
+    ///
+    /// # Errors
+    ///
+    /// * `offsets.len() - 1 != nulls.len()`
+    /// * Any consecutive pair of `offsets` does not denote a valid slice of `values`
+    pub fn try_new(
+        offsets: OffsetBuffer<T::Offset>,
+        values: Buffer,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        let len = offsets.len() - 1;
+
+        // Verify that each pair of offsets is a valid slices of values
+        T::validate(&offsets, &values)?;
+
+        if let Some(n) = nulls.as_ref() {
+            if n.len() != len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
+                    T::Offset::PREFIX,
+                    T::PREFIX,
+                    n.len(),
+                )));
+            }
+        }
+
+        Ok(Self {
+            data_type: T::DATA_TYPE,
+            value_offsets: offsets,
+            value_data: values,
+            nulls,
+        })
+    }
+
+    /// Create a new [`GenericByteArray`] from the provided parts, without validation
+    ///
+    /// # Safety
+    ///
+    /// Safe if [`Self::try_new`] would not error
+    pub fn new_unchecked(
+        offsets: OffsetBuffer<T::Offset>,
+        values: Buffer,
+        nulls: Option<NullBuffer>,
+    ) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            value_offsets: offsets,
+            value_data: values,
+            nulls,
+        }
+    }
+
+    /// Create a new [`GenericByteArray`] of length `len` where all values are null
+    pub fn new_null(len: usize) -> Self {
+        Self {
+            data_type: T::DATA_TYPE,
+            value_offsets: OffsetBuffer::new_zeroed(len),
+            value_data: MutableBuffer::new(0).into(),
+            nulls: Some(NullBuffer::new_null(len)),
+        }
+    }
+
+    /// Deconstruct this array into its constituent parts
+    pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
+        (self.value_offsets, self.value_data, self.nulls)
+    }
+
     /// Returns the length for value at index `i`.
     /// # Panics
     /// Panics if index `i` is out of bounds.
@@ -374,3 +455,60 @@ impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
         ArrayIter::new(self)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::{BinaryArray, StringArray};
+    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
+
+    #[test]
+    fn try_new() {
+        let data = Buffer::from_slice_ref("helloworld");
+        let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
+        StringArray::new(offsets.clone(), data.clone(), None);
+
+        let nulls = NullBuffer::new_null(3);
+        let err =
+            StringArray::try_new(offsets.clone(), data.clone(), Some(nulls.clone()))
+                .unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");
+
+        let err =
+            BinaryArray::try_new(offsets.clone(), data.clone(), Some(nulls)).unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");
+
+        let non_utf8_data = Buffer::from_slice_ref(b"he\xFFloworld");
+        let err = StringArray::try_new(offsets.clone(), non_utf8_data.clone(), None)
+            .unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");
+
+        BinaryArray::new(offsets, non_utf8_data, None);
+
+        let offsets = OffsetBuffer::new(vec![0, 5, 11].into());
+        let err = StringArray::try_new(offsets.clone(), data.clone(), None).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Offset of 11 exceeds length of values 10"
+        );
+
+        let err = BinaryArray::try_new(offsets.clone(), data, None).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
+        );
+
+        let non_ascii_data = Buffer::from_slice_ref("heìloworld");
+        StringArray::new(offsets.clone(), non_ascii_data.clone(), None);
+        BinaryArray::new(offsets, non_ascii_data.clone(), None);
+
+        let offsets = OffsetBuffer::new(vec![0, 3, 10].into());
+        let err = StringArray::try_new(offsets.clone(), non_ascii_data.clone(), None)
+            .unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Split UTF-8 codepoint at offset 3"
+        );
+
+        BinaryArray::new(offsets, non_ascii_data, None);
+    }
+}
diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs
index 7f6f54e4c..f4e5b4b79 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -109,7 +109,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
         if let Some(n) = nulls.as_ref() {
             if n.len() != len {
                 return Err(ArrowError::InvalidArgumentError(format!(
-                    "Incorrect number of nulls for {}ListArray, expected {len} got {}",
+                    "Incorrect length of null buffer for {}ListArray, expected {len} got {}",
                     OffsetSize::PREFIX,
                     n.len(),
                 )));
@@ -1137,7 +1137,7 @@ mod tests {
 
         assert_eq!(
             err.to_string(),
-            "Invalid argument error: Incorrect number of nulls for LargeListArray, expected 4 got 3"
+            "Invalid argument error: Incorrect length of null buffer for LargeListArray, expected 4 got 3"
         );
 
         let field = Arc::new(Field::new("element", DataType::Int64, false));
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index e042f29c2..7c4a37529 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -16,9 +16,7 @@
 // under the License.
 
 use crate::types::GenericStringType;
-use crate::{
-    Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait,
-};
+use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
 use arrow_buffer::{bit_util, MutableBuffer};
 use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType};
@@ -105,27 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
     pub fn try_from_binary(
         v: GenericBinaryArray<OffsetSize>,
     ) -> Result<Self, ArrowError> {
-        let offsets = v.value_offsets();
-        let values = v.value_data();
-
-        // We only need to validate that all values are valid UTF-8
-        let validated = std::str::from_utf8(values).map_err(|e| {
-            ArrowError::CastError(format!("Encountered non UTF-8 data: {e}"))
-        })?;
-
-        for offset in offsets.iter() {
-            let o = offset.as_usize();
-            if !validated.is_char_boundary(o) {
-                return Err(ArrowError::CastError(format!(
-                    "Split UTF-8 codepoint at offset {o}"
-                )));
-            }
-        }
-
-        let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
-        // SAFETY:
-        // Validated UTF-8 above
-        Ok(Self::from(unsafe { builder.build_unchecked() }))
+        let (offsets, values, nulls) = v.into_parts();
+        Self::try_new(offsets, values, nulls)
     }
 }
 
@@ -261,6 +240,7 @@ mod tests {
     use super::*;
     use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
     use crate::types::UInt8Type;
+    use crate::Array;
     use arrow_buffer::Buffer;
     use arrow_schema::Field;
     use std::sync::Arc;
diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
index cec78db9f..b50018ca9 100644
--- a/arrow-array/src/types.rs
+++ b/arrow-array/src/types.rs
@@ -19,7 +19,7 @@
 
 use crate::delta::shift_months;
 use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
-use arrow_buffer::i256;
+use arrow_buffer::{i256, Buffer, OffsetBuffer};
 use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
 use arrow_schema::{
     ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
@@ -1526,10 +1526,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed {
     /// Utf8Array will have native type has &str
     /// BinaryArray will have type as [u8]
     type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;
+
     /// "Binary" or "String", for use in error messages
     const PREFIX: &'static str;
+
     /// Datatype of array elements
     const DATA_TYPE: DataType;
+
+    /// Verifies that every consecutive pair of `offsets` denotes a valid slice of `values`
+    fn validate(
+        offsets: &OffsetBuffer<Self::Offset>,
+        values: &Buffer,
+    ) -> Result<(), ArrowError>;
 }
 
 /// [`ByteArrayType`] for string arrays
@@ -1547,6 +1555,33 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
     } else {
         DataType::Utf8
     };
+
+    fn validate(
+        offsets: &OffsetBuffer<Self::Offset>,
+        values: &Buffer,
+    ) -> Result<(), ArrowError> {
+        // Verify that the slice as a whole is valid UTF-8
+        let validated = std::str::from_utf8(values).map_err(|e| {
+            ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
+        })?;
+
+        // Verify each offset is at a valid character boundary in this UTF-8 array
+        for offset in offsets.iter() {
+            let o = offset.as_usize();
+            if !validated.is_char_boundary(o) {
+                if o < validated.len() {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Split UTF-8 codepoint at offset {o}"
+                    )));
+                }
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Offset of {o} exceeds length of values {}",
+                    validated.len()
+                )));
+            }
+        }
+        Ok(())
+    }
 }
 
 /// An arrow utf8 array with i32 offsets
@@ -1569,6 +1604,21 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericBinaryType<O> {
     } else {
         DataType::Binary
     };
+
+    fn validate(
+        offsets: &OffsetBuffer<Self::Offset>,
+        values: &Buffer,
+    ) -> Result<(), ArrowError> {
+        // offsets are guaranteed to be monotonically increasing and non-empty
+        let max_offset = offsets.last().unwrap().as_usize();
+        if values.len() < max_offset {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Maximum offset of {max_offset} is larger than values of length {}",
+                values.len()
+            )));
+        }
+        Ok(())
+    }
 }
 
 /// An arrow binary array with i32 offsets