You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/25 14:05:21 UTC
[arrow-rs] branch master updated: Add constructors for FixedSize array types (#3879) (#4263)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new cb5d45879 Add constructors for FixedSize array types (#3879) (#4263)
cb5d45879 is described below
commit cb5d45879087fda7b369e565e17545bf3e7e93f2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu May 25 15:05:14 2023 +0100
Add constructors for FixedSize array types (#3879) (#4263)
* Add constructors for FixedSize array types (#3879)
* Clippy
---
arrow-array/src/array/fixed_size_binary_array.rs | 161 +++++++++++++++++-----
arrow-array/src/array/fixed_size_list_array.rs | 162 ++++++++++++++++++++++-
arrow-buffer/src/buffer/null.rs | 26 ++++
arrow-data/src/data/mod.rs | 61 ++-------
4 files changed, 328 insertions(+), 82 deletions(-)
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs
index 083d71cd9..74a7c4c7a 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -19,7 +19,7 @@ use crate::array::print_long_array;
use crate::iterator::FixedSizeBinaryIter;
use crate::{Array, ArrayAccessor, ArrayRef, FixedSizeListArray};
use arrow_buffer::buffer::NullBuffer;
-use arrow_buffer::{bit_util, Buffer, MutableBuffer};
+use arrow_buffer::{bit_util, ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
@@ -59,6 +59,78 @@ pub struct FixedSizeBinaryArray {
}
impl FixedSizeBinaryArray {
+ /// Create a new [`FixedSizeBinaryArray`] with `size` element size, panicking on failure
+ ///
+ /// # Panics
+ ///
+ /// Panics if [`Self::try_new`] returns an error
+ pub fn new(size: i32, values: Buffer, nulls: Option<NullBuffer>) -> Self {
+ Self::try_new(size, values, nulls).unwrap()
+ }
+
+ /// Create a new [`FixedSizeBinaryArray`] from the provided parts, returning an error on failure
+ ///
+ /// # Errors
+ ///
+ /// * `size < 0`
+ /// * `values.len() / size != nulls.len()`
+ pub fn try_new(
+ size: i32,
+ values: Buffer,
+ nulls: Option<NullBuffer>,
+ ) -> Result<Self, ArrowError> {
+ let data_type = DataType::FixedSizeBinary(size);
+ let s = size.to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Size cannot be negative, got {}",
+ size
+ ))
+ })?;
+
+ let len = values.len() / s;
+ if let Some(n) = nulls.as_ref() {
+ if n.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect length of null buffer for FixedSizeBinaryArray, expected {} got {}",
+ len,
+ n.len(),
+ )));
+ }
+ }
+
+ Ok(Self {
+ data_type,
+ value_data: values,
+ value_length: size,
+ nulls,
+ len,
+ })
+ }
+
+ /// Create a new [`FixedSizeBinaryArray`] of length `len` where all values are null
+ ///
+ /// # Panics
+ ///
+ /// Panics if
+ ///
+ /// * `size < 0`
+ /// * `size * len` would overflow `usize`
+ pub fn new_null(size: i32, len: usize) -> Self {
+ let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
+ Self {
+ data_type: DataType::FixedSizeBinary(size),
+ value_data: MutableBuffer::new(capacity).into(),
+ nulls: Some(NullBuffer::new_null(len)),
+ value_length: size,
+ len,
+ }
+ }
+
+ /// Deconstruct this array into its constituent parts
+ pub fn into_parts(self) -> (i32, Buffer, Option<NullBuffer>) {
+ (self.value_length, self.value_data, self.nulls)
+ }
+
/// Returns the element at index `i` as a byte slice.
/// # Panics
/// Panics if index `i` is out of bounds.
@@ -215,19 +287,17 @@ impl FixedSizeBinaryArray {
));
}
- let size = size.unwrap_or(0);
- let array_data = unsafe {
- ArrayData::new_unchecked(
- DataType::FixedSizeBinary(size as i32),
- len,
- None,
- Some(null_buf.into()),
- 0,
- vec![buffer.into()],
- vec![],
- )
- };
- Ok(FixedSizeBinaryArray::from(array_data))
+ let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
+ let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
+
+ let size = size.unwrap_or(0) as i32;
+ Ok(Self {
+ data_type: DataType::FixedSizeBinary(size),
+ value_data: buffer.into(),
+ nulls,
+ value_length: size,
+ len,
+ })
}
/// Create an array from an iterable argument of sparse byte slices.
@@ -298,18 +368,16 @@ impl FixedSizeBinaryArray {
Ok(())
})?;
- let array_data = unsafe {
- ArrayData::new_unchecked(
- DataType::FixedSizeBinary(size),
- len,
- None,
- Some(null_buf.into()),
- 0,
- vec![buffer.into()],
- vec![],
- )
- };
- Ok(FixedSizeBinaryArray::from(array_data))
+ let null_buf = BooleanBuffer::new(null_buf.into(), 0, len);
+ let nulls = Some(NullBuffer::new(null_buf)).filter(|n| n.null_count() > 0);
+
+ Ok(Self {
+ data_type: DataType::FixedSizeBinary(size),
+ value_data: buffer.into(),
+ nulls,
+ len,
+ value_length: size,
+ })
}
/// Create an array from an iterable argument of byte slices.
@@ -368,12 +436,14 @@ impl FixedSizeBinaryArray {
));
}
- let size = size.unwrap_or(0);
- let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32))
- .len(len)
- .add_buffer(buffer.into());
- let array_data = unsafe { array_data.build_unchecked() };
- Ok(FixedSizeBinaryArray::from(array_data))
+ let size = size.unwrap_or(0).try_into().unwrap();
+ Ok(Self {
+ data_type: DataType::FixedSizeBinary(size),
+ value_data: buffer.into(),
+ nulls: None,
+ value_length: size,
+ len,
+ })
}
#[inline]
@@ -873,4 +943,31 @@ mod tests {
array.value(4);
}
+
+ #[test]
+ fn test_constructors() {
+ let buffer = Buffer::from_vec(vec![0_u8; 10]);
+ let a = FixedSizeBinaryArray::new(2, buffer.clone(), None);
+ assert_eq!(a.len(), 5);
+
+ let nulls = NullBuffer::new_null(5);
+ FixedSizeBinaryArray::new(2, buffer.clone(), Some(nulls));
+
+ let a = FixedSizeBinaryArray::new(3, buffer.clone(), None);
+ assert_eq!(a.len(), 3);
+
+ let nulls = NullBuffer::new_null(3);
+ FixedSizeBinaryArray::new(3, buffer.clone(), Some(nulls));
+
+ let err = FixedSizeBinaryArray::try_new(-1, buffer.clone(), None).unwrap_err();
+
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: Size cannot be negative, got -1"
+ );
+
+ let nulls = NullBuffer::new_null(3);
+ let err = FixedSizeBinaryArray::try_new(2, buffer, Some(nulls)).unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeBinaryArray, expected 5 got 3");
+ }
}
diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs
index 18fa9df92..3df108ced 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -19,8 +19,9 @@ use crate::array::print_long_array;
use crate::builder::{FixedSizeListBuilder, PrimitiveBuilder};
use crate::{make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType};
use arrow_buffer::buffer::NullBuffer;
+use arrow_buffer::ArrowNativeType;
use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::DataType;
+use arrow_schema::{ArrowError, DataType, FieldRef};
use std::any::Any;
use std::sync::Arc;
@@ -68,6 +69,114 @@ pub struct FixedSizeListArray {
}
impl FixedSizeListArray {
+ /// Create a new [`FixedSizeListArray`] with `size` element size, panicking on failure
+ ///
+ /// # Panics
+ ///
+ /// Panics if [`Self::try_new`] returns an error
+ pub fn new(
+ field: FieldRef,
+ size: i32,
+ values: ArrayRef,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ Self::try_new(field, size, values, nulls).unwrap()
+ }
+
+ /// Create a new [`FixedSizeListArray`] from the provided parts, returning an error on failure
+ ///
+ /// # Errors
+ ///
+ /// * `size < 0`
+ /// * `values.len() / size != nulls.len()`
+ /// * `values.data_type() != field.data_type()`
+ /// * `!field.is_nullable() && !nulls.expand(size).contains(values.nulls())`
+ pub fn try_new(
+ field: FieldRef,
+ size: i32,
+ values: ArrayRef,
+ nulls: Option<NullBuffer>,
+ ) -> Result<Self, ArrowError> {
+ let s = size.to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Size cannot be negative, got {}",
+ size
+ ))
+ })?;
+
+ let len = values.len() / s;
+ if let Some(n) = nulls.as_ref() {
+ if n.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect length of null buffer for FixedSizeListArray, expected {} got {}",
+ len,
+ n.len(),
+ )));
+ }
+ }
+
+ if field.data_type() != values.data_type() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "FixedSizeListArray expected data type {} got {} for {:?}",
+ field.data_type(),
+ values.data_type(),
+ field.name()
+ )));
+ }
+
+ if let Some(a) = values.nulls() {
+ let nulls_valid = field.is_nullable()
+ || nulls
+ .as_ref()
+ .map(|n| n.expand(size as _).contains(a))
+ .unwrap_or_default();
+
+ if !nulls_valid {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Found unmasked nulls for non-nullable FixedSizeListArray field {:?}",
+ field.name()
+ )));
+ }
+ }
+
+ let data_type = DataType::FixedSizeList(field, size);
+ Ok(Self {
+ data_type,
+ values,
+ value_length: size,
+ nulls,
+ len,
+ })
+ }
+
+ /// Create a new [`FixedSizeListArray`] of length `len` where all values are null
+ ///
+ /// # Panics
+ ///
+ /// Panics if
+ ///
+ /// * `size < 0`
+ /// * `size * len` would overflow `usize`
+ pub fn new_null(field: FieldRef, size: i32, len: usize) -> Self {
+ let capacity = size.to_usize().unwrap().checked_mul(len).unwrap();
+ Self {
+ values: make_array(ArrayData::new_null(field.data_type(), capacity)),
+ data_type: DataType::FixedSizeList(field, size),
+ nulls: Some(NullBuffer::new_null(len)),
+ value_length: size,
+ len,
+ }
+ }
+
+ /// Deconstruct this array into its constituent parts
+ pub fn into_parts(self) -> (FieldRef, i32, ArrayRef, Option<NullBuffer>) {
+ let f = match self.data_type {
+ DataType::FixedSizeList(f, _) => f,
+ _ => unreachable!(),
+ };
+ (f, self.value_length, self.values, self.nulls)
+ }
+
/// Returns a reference to the values of this list.
pub fn values(&self) -> &ArrayRef {
&self.values
@@ -285,7 +394,8 @@ mod tests {
use super::*;
use crate::cast::AsArray;
use crate::types::Int32Type;
- use arrow_buffer::{bit_util, Buffer};
+ use crate::Int32Array;
+ use arrow_buffer::{bit_util, BooleanBuffer, Buffer};
use arrow_schema::Field;
#[test]
@@ -460,4 +570,52 @@ mod tests {
list_array.value(10);
}
+
+ #[test]
+ fn test_fixed_size_list_constructors() {
+ let values = Arc::new(Int32Array::from_iter([
+ Some(1),
+ Some(2),
+ None,
+ None,
+ Some(3),
+ Some(4),
+ ]));
+
+ let field = Arc::new(Field::new("item", DataType::Int32, true));
+ let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), None);
+ assert_eq!(list.len(), 3);
+
+ let nulls = NullBuffer::new_null(3);
+ let list = FixedSizeListArray::new(field.clone(), 2, values.clone(), Some(nulls));
+ assert_eq!(list.len(), 3);
+
+ let list = FixedSizeListArray::new(field.clone(), 4, values.clone(), None);
+ assert_eq!(list.len(), 1);
+
+ let err = FixedSizeListArray::try_new(field.clone(), -1, values.clone(), None)
+ .unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Invalid argument error: Size cannot be negative, got -1"
+ );
+
+ let nulls = NullBuffer::new_null(2);
+ let err = FixedSizeListArray::try_new(field, 2, values.clone(), Some(nulls))
+ .unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for FixedSizeListArray, expected 3 got 2");
+
+ let field = Arc::new(Field::new("item", DataType::Int32, false));
+ let err = FixedSizeListArray::try_new(field.clone(), 2, values.clone(), None)
+ .unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: Found unmasked nulls for non-nullable FixedSizeListArray field \"item\"");
+
+ // Valid as nulls in child masked by parent
+ let nulls = NullBuffer::new(BooleanBuffer::new(vec![0b0000101].into(), 0, 3));
+ FixedSizeListArray::new(field, 2, values.clone(), Some(nulls));
+
+ let field = Arc::new(Field::new("item", DataType::Int64, true));
+ let err = FixedSizeListArray::try_new(field, 2, values, None).unwrap_err();
+ assert_eq!(err.to_string(), "Invalid argument error: FixedSizeListArray expected data type Int64 got Int32 for \"item\"");
+ }
}
diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs
index 60987be6e..008d1f04f 100644
--- a/arrow-buffer/src/buffer/null.rs
+++ b/arrow-buffer/src/buffer/null.rs
@@ -76,11 +76,37 @@ impl NullBuffer {
/// Returns true if all nulls in `other` also exist in self
pub fn contains(&self, other: &NullBuffer) -> bool {
+ if other.null_count == 0 {
+ return true;
+ }
let lhs = self.inner().bit_chunks().iter_padded();
let rhs = other.inner().bit_chunks().iter_padded();
lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
}
+ /// Returns a new [`NullBuffer`] where each bit in the current null buffer
+ /// is repeated `count` times. This is useful for masking the nulls of
+ /// the child of a FixedSizeListArray based on its parent
+ pub fn expand(&self, count: usize) -> Self {
+ let capacity = self.buffer.len().checked_mul(count).unwrap();
+ let mut buffer = MutableBuffer::new_null(capacity);
+
+ // Expand each bit within `null_mask` into `element_len`
+ // bits, constructing the implicit mask of the child elements
+ for i in 0..self.buffer.len() {
+ if self.is_null(i) {
+ continue;
+ }
+ for j in 0..count {
+ crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
+ }
+ }
+ Self {
+ buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
+ null_count: self.null_count * count,
+ }
+ }
+
/// Returns the length of this [`NullBuffer`]
#[inline]
pub fn len(&self) -> usize {
diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs
index 103161f5a..32aae1e92 100644
--- a/arrow-data/src/data/mod.rs
+++ b/arrow-data/src/data/mod.rs
@@ -19,7 +19,6 @@
//! common attributes and operations for Arrow array.
use crate::bit_iterator::BitSliceIterator;
-use arrow_buffer::bit_chunk_iterator::BitChunks;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer};
use arrow_schema::{ArrowError, DataType, UnionMode};
@@ -1143,7 +1142,7 @@ impl ArrayData {
match &self.data_type {
DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
if !f.is_nullable() {
- self.validate_non_nullable(None, 0, &self.child_data[0])?
+ self.validate_non_nullable(None, &self.child_data[0])?
}
}
DataType::FixedSizeList(field, len) => {
@@ -1152,40 +1151,17 @@ impl ArrayData {
match &self.nulls {
Some(nulls) => {
let element_len = *len as usize;
- let mut buffer =
- MutableBuffer::new_null(element_len * self.len);
-
- // Expand each bit within `null_mask` into `element_len`
- // bits, constructing the implicit mask of the child elements
- for i in 0..self.len {
- if nulls.is_null(i) {
- continue;
- }
- for j in 0..element_len {
- bit_util::set_bit(
- buffer.as_mut(),
- i * element_len + j,
- )
- }
- }
- let mask = buffer.into();
- self.validate_non_nullable(Some(&mask), 0, child)?;
+ let expanded = nulls.expand(element_len);
+ self.validate_non_nullable(Some(&expanded), child)?;
}
- None => self.validate_non_nullable(None, 0, child)?,
+ None => self.validate_non_nullable(None, child)?,
}
}
}
DataType::Struct(fields) => {
for (field, child) in fields.iter().zip(&self.child_data) {
if !field.is_nullable() {
- match &self.nulls {
- Some(n) => self.validate_non_nullable(
- Some(n.buffer()),
- n.offset(),
- child,
- )?,
- None => self.validate_non_nullable(None, 0, child)?,
- }
+ self.validate_non_nullable(self.nulls(), child)?
}
}
}
@@ -1198,12 +1174,11 @@ impl ArrayData {
/// Verifies that `child` contains no nulls not present in `mask`
fn validate_non_nullable(
&self,
- mask: Option<&Buffer>,
- offset: usize,
+ mask: Option<&NullBuffer>,
child: &ArrayData,
) -> Result<(), ArrowError> {
let mask = match mask {
- Some(mask) => mask.as_ref(),
+ Some(mask) => mask,
None => return match child.null_count() {
0 => Ok(()),
_ => Err(ArrowError::InvalidArgumentError(format!(
@@ -1215,23 +1190,13 @@ impl ArrayData {
};
match child.nulls() {
- Some(nulls) => {
- let mask = BitChunks::new(mask, offset, child.len);
- let nulls = BitChunks::new(nulls.validity(), nulls.offset(), child.len);
- mask
- .iter_padded()
- .zip(nulls.iter_padded())
- .try_for_each(|(m, c)| {
- if (m & !c) != 0 {
- return Err(ArrowError::InvalidArgumentError(format!(
- "non-nullable child of type {} contains nulls not present in parent",
- child.data_type
- )))
- }
- Ok(())
- })
+ Some(nulls) if !mask.contains(nulls) => {
+ Err(ArrowError::InvalidArgumentError(format!(
+ "non-nullable child of type {} contains nulls not present in parent",
+ child.data_type
+ )))
}
- None => Ok(()),
+ _ => Ok(()),
}
}