You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/25 14:05:21 UTC

[arrow-rs] branch master updated: Add constructors for FixedSize array types (#3879) (#4263)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new cb5d45879 Add constructors for FixedSize array types (#3879) (#4263)
cb5d45879 is described below

commit cb5d45879087fda7b369e565e17545bf3e7e93f2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu May 25 15:05:14 2023 +0100

    Add constructors for FixedSize array types (#3879) (#4263)
    
    * Add constructors for FixedSize array types (#3879)
    
    * Clippy
---
 arrow-array/src/array/fixed_size_binary_array.rs | 161 +++++++++++++++++-----
 arrow-array/src/array/fixed_size_list_array.rs   | 162 ++++++++++++++++++++++-
 arrow-buffer/src/buffer/null.rs                  |  26 ++++
 arrow-data/src/data/mod.rs                       |  61 ++-------
 4 files changed, 328 insertions(+), 82 deletions(-)

diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs
index 083d71cd9..74a7c4c7a 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -19,7 +19,7 @@ use crate::array::print_long_array;
 use crate::iterator::FixedSizeBinaryIter;
 use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray};
 use arrow_buffer::buffer::NullBuffer;
-use arrow_buffer::{bit_util, Buffer, MutableBuffer};
+use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
@@ -59,6 +59,78 @@ pub struct FixedSizeBinaryArray {
 }
 
 impl FixedSizeBinaryArray {
+    /// Create a new [`FixedSizeBinaryArray`] with `size` element size, panicking on failure
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`Self::try_new`] returns an error
+    pub fn new(size: i32, values: Buffer, nulls: Option<NullBuffer>) -> Self {
+        Self::try_new(size, values, nulls).unwrap()
+    }
+
+    /// Create a new [`FixedSizeBinaryArray`] from the provided parts, returning an error on failure
+    ///
+    /// # Errors
+    ///
+    /// * `size < 0`
+    /// * `values.len() / size != nulls.len()`
+    pub fn try_new(
+        size: i32,
+        values: Buffer,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        let data_type = DataType::FixedSizeBinary(size);
+        let s = size.to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Size cannot be negative, got {}",
+                size
+            ))
+        })?;
+
+        let len = values.len() / s;
+        if let Some(n) = nulls.as_ref() {
+            if n.len() != len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect length of null buffer for FixedSizeBinaryArray, expected {} got {}",
+                    len,
+                    n.len(),
+                )));
+            }
+        }
+
+        Ok(Self {
+            data_type,
+            value_data: values,
+            value_length: size,
+            nulls,
+            len,
+        })
+    }
+
+    /// Create a new [`FixedSizeBinaryArray`] of length `len` where all values are null
+    ///
+    /// # Panics
+    ///
+    /// Panics if
+    ///
+    /// * `size < 0`
+    /// * `size * len` would overflow `usize`
+    pub fn new_null(size: i32, len: usize) -> Self {
+        let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
+        Self {
+            data_type: DataType::FixedSizeBinary(size),
+            value_data: MutableBuffer::new(capacity).into(),
+            nulls: Some(NullBuffer::new_null(len)),
+            value_length: size,
+            len,
+        }
+    }
+
+    /// Deconstruct this array into its constituent parts
+    pub fn into_parts(self) -> (i32, Buffer, Option<NullBuffer>) {
+        (self.value_length, self.value_data, self.nulls)
+    }
+
     /// Returns the element at index `i` as a byte slice.
     /// # Panics
     /// Panics if index `i` is out of bounds.
@@ -215,19 +287,17 @@ impl FixedSizeBinaryArray {
             ));
         }
 
-        let size = size.unwrap_or(0);
-        let array_data = unsafe {
-            ArrayData::new_unchecked(
-                DataType::FixedSizeBinary(size as i32),
-                len,
-                None,
-                Some(null_buf.into()),
-                0,
-                vec![buffer.into()],
-                vec![],
-            )
-        };
-        Ok(FixedSizeBinaryArray::from(array_data))
+        let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
+        let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
+
+        let size = size.unwrap_or(0) as i32;
+        Ok(Self {
+            data_type: DataType::FixedSizeBinary(size),
+            value_data: buffer.into(),
+            nulls,
+            value_length: size,
+            len,
+        })
     }
 
     /// Create an array from an iterable argument of sparse byte slices.
@@ -298,18 +368,16 @@ impl FixedSizeBinaryArray {
             Ok(())
         })?;
 
-        let array_data = unsafe {
-            ArrayData::new_unchecked(
-                DataType::FixedSizeBinary(size),
-                len,
-                None,
-                Some(null_buf.into()),
-                0,
-                vec![buffer.into()],
-                vec![],
-            )
-        };
-        Ok(FixedSizeBinaryArray::from(array_data))
+        let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
+        let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
+
+        Ok(Self {
+            data_type: DataType::FixedSizeBinary(size),
+            value_data: buffer.into(),
+            nulls,
+            len,
+            value_length: size,
+        })
     }
 
     /// Create an array from an iterable argument of byte slices.
@@ -368,12 +436,14 @@ impl FixedSizeBinaryArray {
             ));
         }
 
-        let size = size.unwrap_or(0);
-        let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32))
-            .len(len)
-            .add_buffer(buffer.into());
-        let array_data = unsafe { array_data.build_unchecked() };
-        Ok(FixedSizeBinaryArray::from(array_data))
+        let size = size.unwrap_or(0).try_into().unwrap();
+        Ok(Self {
+            data_type: DataType::FixedSizeBinary(size),
+            value_data: buffer.into(),
+            nulls: None,
+            value_length: size,
+            len,
+        })
     }
 
     #[inline]
@@ -873,4 +943,31 @@ mod tests {
 
         array.value(4);
     }
+
+    #[test]
+    fn test_constructors() {
+        let buffer = Buffer::from_vec(vec![0_u8; 10]);
+        let a = FixedSizeBinaryArray::new(2, buffer.clone(), None);
+        assert_eq!(a.len(), 5);
+
+        let nulls = NullBuffer::new_null(5);
+        FixedSizeBinaryArray::new(2, buffer.clone(), Some(nulls));
+
+        let a = FixedSizeBinaryArray::new(3, buffer.clone(), None);
+        assert_eq!(a.len(), 3);
+
+        let nulls = NullBuffer::new_null(3);
+        FixedSizeBinaryArray::new(3, buffer.clone(), Some(nulls));
+
+        let err = FixedSizeBinaryArray::try_new(-1, buffer.clone(), None).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Size cannot be negative, got -1"
+        );
+
+        let nulls = NullBuffer::new_null(3);
+        let err = FixedSizeBinaryArray::try_new(2, buffer, Some(nulls)).unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeBinaryArray, expected 5 got 3");
+    }
 }
diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs
index 18fa9df92..3df108ced 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -19,8 +19,9 @@ use crate::array::print_long_array;
 use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
 use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
 use arrow_buffer::buffer::NullBuffer;
+use arrow_buffer::ArrowNativeType;
 use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::DataType;
+use arrow_schema::{ArrowError, DataType, FieldRef};
 use std::any::Any;
 use std::sync::Arc;
 
@@ -68,6 +69,114 @@ pub struct FixedSizeListArray {
 }
 
 impl FixedSizeListArray {
+    /// Create a new [`FixedSizeListArray`] with `size` element size, panicking on failure
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`Self::try_new`] returns an error
+    pub fn new(
+        field: FieldRef,
+        size: i32,
+        values: ArrayRef,
+        nulls: Option<NullBuffer>,
+    ) -> Self {
+        Self::try_new(field, size, values, nulls).unwrap()
+    }
+
+    /// Create a new [`FixedSizeListArray`] from the provided parts, returning an error on failure
+    ///
+    /// # Errors
+    ///
+    /// * `size < 0`
+    /// * `values.len() / size != nulls.len()`
+    /// * `values.data_type() != field.data_type()`
+    /// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())`
+    pub fn try_new(
+        field: FieldRef,
+        size: i32,
+        values: ArrayRef,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        let s = size.to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Size cannot be negative, got {}",
+                size
+            ))
+        })?;
+
+        let len = values.len() / s;
+        if let Some(n) = nulls.as_ref() {
+            if n.len() != len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect length of null buffer for FixedSizeListArray, expected {} got {}",
+                    len,
+                    n.len(),
+                )));
+            }
+        }
+
+        if field.data_type() != values.data_type() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "FixedSizeListArray expected data type {} got {} for {:?}",
+                field.data_type(),
+                values.data_type(),
+                field.name()
+            )));
+        }
+
+        if let Some(a) = values.nulls() {
+            let nulls_valid = field.is_nullable()
+                || nulls
+                    .as_ref()
+                    .map(|n| n.expand(size as _).contains(a))
+                    .unwrap_or_default();
+
+            if !nulls_valid {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Found unmasked nulls for non-nullable FixedSizeListArray field {:?}",
+                    field.name()
+                )));
+            }
+        }
+
+        let data_type = DataType::FixedSizeList(field, size);
+        Ok(Self {
+            data_type,
+            values,
+            value_length: size,
+            nulls,
+            len,
+        })
+    }
+
+    /// Create a new [`FixedSizeListArray`] of length `len` where all values are null
+    ///
+    /// # Panics
+    ///
+    /// Panics if
+    ///
+    /// * `size < 0`
+    /// * `size * len` would overflow `usize`
+    pub fn new_null(field: FieldRef, size: i32, len: usize) -> Self {
+        let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
+        Self {
+            values: make_array(ArrayData::new_null(field.data_type(), capacity)),
+            data_type: DataType::FixedSizeList(field, size),
+            nulls: Some(NullBuffer::new_null(len)),
+            value_length: size,
+            len,
+        }
+    }
+
+    /// Deconstruct this array into its constituent parts
+    pub fn into_parts(self) -> (FieldRef, i32, ArrayRef, Option<NullBuffer>) {
+        let f = match self.data_type {
+            DataType::FixedSizeList(f, _) => f,
+            _ => unreachable!(),
+        };
+        (f, self.value_length, self.values, self.nulls)
+    }
+
     /// Returns a reference to the values of this list.
     pub fn values(&self) -> &ArrayRef {
         &self.values
@@ -285,7 +394,8 @@ mod tests {
     use super::*;
     use crate::cast::AsArray;
     use crate::types::Int32Type;
-    use arrow_buffer::{bit_util, Buffer};
+    use crate::Int32Array;
+    use arrow_buffer::{bit_util, BooleanBuffer, Buffer};
     use arrow_schema::Field;
 
     #[test]
@@ -460,4 +570,52 @@ mod tests {
 
         list_array.value(10);
     }
+
+    #[test]
+    fn test_fixed_size_list_constructors() {
+        let values = Arc::new(Int32Array::from_iter([
+            Some(1),
+            Some(2),
+            None,
+            None,
+            Some(3),
+            Some(4),
+        ]));
+
+        let field = Arc::new(Field::new("item", DataType::Int32, true));
+        let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), None);
+        assert_eq!(list.len(), 3);
+
+        let nulls = NullBuffer::new_null(3);
+        let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), Some(nulls));
+        assert_eq!(list.len(), 3);
+
+        let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None);
+        assert_eq!(list.len(), 1);
+
+        let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None)
+            .unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Size cannot be negative, got -1"
+        );
+
+        let nulls = NullBuffer::new_null(2);
+        let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls))
+            .unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2");
+
+        let field = Arc::new(Field::new("item", DataType::Int32, false));
+        let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None)
+            .unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\"");
+
+        // Valid as nulls in child masked by parent
+        let nulls = NullBuffer::new(BooleanBuffer::new(vec![0b0000101].into(), 0, 3));
+        FixedSizeListArray::new(field, 2, values.clone(), Some(nulls));
+
+        let field = Arc::new(Field::new("item", DataType::Int64, true));
+        let err = FixedSizeListArray::try_new(field, 2, values, None).unwrap_err();
+        assert_eq!(err.to_string(), "Invalid argument error: FixedSizeListArray expected data type Int64 got Int32 for \"item\"");
+    }
 }
diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs
index 60987be6e..008d1f04f 100644
--- a/arrow-buffer/src/buffer/null.rs
+++ b/arrow-buffer/src/buffer/null.rs
@@ -76,11 +76,37 @@ impl NullBuffer {
 
     /// Returns true if all nulls in `other` also exist in self
     pub fn contains(&self, other: &NullBuffer) -> bool {
+        if other.null_count == 0 {
+            return true;
+        }
         let lhs = self.inner().bit_chunks().iter_padded();
         let rhs = other.inner().bit_chunks().iter_padded();
         lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
     }
 
+    /// Returns a new [`NullBuffer`] where each bit in the current null buffer
+    /// is repeated `count` times. This is useful for masking the nulls of
+    /// the child of a FixedSizeListArray based on its parent
+    pub fn expand(&self, count: usize) -> Self {
+        let capacity = self.buffer.len().checked_mul(count).unwrap();
+        let mut buffer = MutableBuffer::new_null(capacity);
+
+        // Expand each bit within `null_mask` into `element_len`
+        // bits, constructing the implicit mask of the child elements
+        for i in 0..self.buffer.len() {
+            if self.is_null(i) {
+                continue;
+            }
+            for j in 0..count {
+                crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
+            }
+        }
+        Self {
+            buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
+            null_count: self.null_count * count,
+        }
+    }
+
     /// Returns the length of this [`NullBuffer`]
     #[inline]
     pub fn len(&self) -> usize {
diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs
index 103161f5a..32aae1e92 100644
--- a/arrow-data/src/data/mod.rs
+++ b/arrow-data/src/data/mod.rs
@@ -19,7 +19,6 @@
 //! common attributes and operations for Arrow array.
 
 use crate::bit_iterator::BitSliceIterator;
-use arrow_buffer::bit_chunk_iterator::BitChunks;
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
 use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer};
 use arrow_schema::{ArrowError, DataType, UnionMode};
@@ -1143,7 +1142,7 @@ impl ArrayData {
         match &self.data_type {
             DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
                 if !f.is_nullable() {
-                    self.validate_non_nullable(None, 0, &self.child_data[0])?
+                    self.validate_non_nullable(None, &self.child_data[0])?
                 }
             }
             DataType::FixedSizeList(field, len) => {
@@ -1152,40 +1151,17 @@ impl ArrayData {
                     match &self.nulls {
                         Some(nulls) => {
                             let element_len = *len as usize;
-                            let mut buffer =
-                                MutableBuffer::new_null(element_len * self.len);
-
-                            // Expand each bit within `null_mask` into `element_len`
-                            // bits, constructing the implicit mask of the child elements
-                            for i in 0..self.len {
-                                if nulls.is_null(i) {
-                                    continue;
-                                }
-                                for j in 0..element_len {
-                                    bit_util::set_bit(
-                                        buffer.as_mut(),
-                                        i * element_len + j,
-                                    )
-                                }
-                            }
-                            let mask = buffer.into();
-                            self.validate_non_nullable(Some(&mask), 0, child)?;
+                            let expanded = nulls.expand(element_len);
+                            self.validate_non_nullable(Some(&expanded), child)?;
                         }
-                        None => self.validate_non_nullable(None, 0, child)?,
+                        None => self.validate_non_nullable(None, child)?,
                     }
                 }
             }
             DataType::Struct(fields) => {
                 for (field, child) in fields.iter().zip(&self.child_data) {
                     if !field.is_nullable() {
-                        match &self.nulls {
-                            Some(n) => self.validate_non_nullable(
-                                Some(n.buffer()),
-                                n.offset(),
-                                child,
-                            )?,
-                            None => self.validate_non_nullable(None, 0, child)?,
-                        }
+                        self.validate_non_nullable(self.nulls(), child)?
                     }
                 }
             }
@@ -1198,12 +1174,11 @@ impl ArrayData {
     /// Verifies that `child` contains no nulls not present in `mask`
     fn validate_non_nullable(
         &self,
-        mask: Option<&Buffer>,
-        offset: usize,
+        mask: Option<&NullBuffer>,
         child: &ArrayData,
     ) -> Result<(), ArrowError> {
         let mask = match mask {
-            Some(mask) => mask.as_ref(),
+            Some(mask) => mask,
             None => return match child.null_count() {
                 0 => Ok(()),
                 _ => Err(ArrowError::InvalidArgumentError(format!(
@@ -1215,23 +1190,13 @@ impl ArrayData {
         };
 
         match child.nulls() {
-            Some(nulls) => {
-                let mask = BitChunks::new(mask, offset, child.len);
-                let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len);
-                mask
-                    .iter_padded()
-                    .zip(nulls.iter_padded())
-                    .try_for_each(|(m, c)| {
-                    if (m & !c) != 0 {
-                        return Err(ArrowError::InvalidArgumentError(format!(
-                            "non-nullable child of type {} contains nulls not present in parent",
-                            child.data_type
-                        )))
-                    }
-                    Ok(())
-                })
+            Some(nulls) if !mask.contains(nulls) => {
+                Err(ArrowError::InvalidArgumentError(format!(
+                    "non-nullable child of type {} contains nulls not present in parent",
+                    child.data_type
+                )))
             }
-            None => Ok(()),
+            _ => Ok(()),
         }
     }