You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/11/09 11:47:53 UTC
[arrow-rs] 01/01: Validate arguments to ArrayData::new and null bit
buffer and buffers (#810)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch cherry_pick_74b520c4
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 542103a4604b3f87404c91176b1a0e800eeebd04
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Nov 8 14:11:10 2021 -0500
Validate arguments to ArrayData::new and null bit buffer and buffers (#810)
* Validate arguments to ArrayData::new: null bit buffer and buffers
* Rename is_int_type to is_dictionary_key_type()
* Correctly handle self.offset in offsets buffer
* Consolidate checks
* Fix test output
---
arrow/src/array/array_binary.rs | 41 +-
arrow/src/array/array_boolean.rs | 11 +-
arrow/src/array/array_list.rs | 84 ++--
arrow/src/array/array_map.rs | 4 +-
arrow/src/array/array_primitive.rs | 14 +-
arrow/src/array/array_union.rs | 5 +-
arrow/src/array/data.rs | 916 +++++++++++++++++++++++++++++++++++--
arrow/src/compute/kernels/cast.rs | 1 +
arrow/src/compute/util.rs | 5 +-
arrow/src/datatypes/datatype.rs | 10 +
arrow/src/ipc/reader.rs | 17 +-
11 files changed, 1004 insertions(+), 104 deletions(-)
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index 89a3efd..51743d8 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -891,10 +891,18 @@ mod tests {
assert!(binary_array.is_valid(i));
assert!(!binary_array.is_null(i));
}
+ }
+
+ #[test]
+ fn test_binary_array_with_offsets() {
+ let values: [u8; 12] = [
+ b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+ ];
+ let offsets: [i32; 4] = [0, 5, 5, 12];
// Test binary array with offset
let array_data = ArrayData::builder(DataType::Binary)
- .len(4)
+ .len(2)
.offset(1)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values))
@@ -947,10 +955,18 @@ mod tests {
assert!(binary_array.is_valid(i));
assert!(!binary_array.is_null(i));
}
+ }
+
+ #[test]
+ fn test_large_binary_array_with_offsets() {
+ let values: [u8; 12] = [
+ b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+ ];
+ let offsets: [i64; 4] = [0, 5, 5, 12];
// Test binary array with offset
let array_data = ArrayData::builder(DataType::LargeBinary)
- .len(4)
+ .len(2)
.offset(1)
.add_buffer(Buffer::from_slice_ref(&offsets))
.add_buffer(Buffer::from_slice_ref(&values))
@@ -1196,26 +1212,25 @@ mod tests {
#[test]
#[should_panic(
- expected = "FixedSizeBinaryArray can only be created from list array of u8 values \
- (i.e. FixedSizeList<PrimitiveArray<u8>>)."
+ expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
)]
fn test_fixed_size_binary_array_from_incorrect_list_array() {
let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
let values_data = ArrayData::builder(DataType::UInt32)
.len(12)
.add_buffer(Buffer::from_slice_ref(&values))
- .add_child_data(ArrayData::builder(DataType::Boolean).build().unwrap())
.build()
.unwrap();
- let array_data = ArrayData::builder(DataType::FixedSizeList(
- Box::new(Field::new("item", DataType::Binary, false)),
- 4,
- ))
- .len(3)
- .add_child_data(values_data)
- .build()
- .unwrap();
+ let array_data = unsafe {
+ ArrayData::builder(DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Binary, false)),
+ 4,
+ ))
+ .len(3)
+ .add_child_data(values_data)
+ .build_unchecked()
+ };
let list_array = FixedSizeListArray::from(array_data);
FixedSizeBinaryArray::from(list_array);
}
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index 07f3da6..3b19594 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -332,10 +332,11 @@ mod tests {
#[should_panic(expected = "BooleanArray data should contain a single buffer only \
(values buffer)")]
fn test_boolean_array_invalid_buffer_len() {
- let data = ArrayData::builder(DataType::Boolean)
- .len(5)
- .build()
- .unwrap();
- BooleanArray::from(data);
+ let data = unsafe {
+ ArrayData::builder(DataType::Boolean)
+ .len(5)
+ .build_unchecked()
+ };
+ drop(BooleanArray::from(data));
}
}
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index fbba8fc..2d2c62a 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -552,9 +552,10 @@ mod tests {
assert!(!list_array.is_null(i));
}
- // Now test with a non-zero offset
+ // Now test with a non-zero offset (skip first element)
+ // [[3, 4, 5], [6, 7]]
let list_data = ArrayData::builder(list_data_type)
- .len(3)
+ .len(2)
.offset(1)
.add_buffer(value_offsets)
.add_child_data(value_data.clone())
@@ -565,7 +566,7 @@ mod tests {
let values = list_array.values();
assert_eq!(&value_data, values.data());
assert_eq!(DataType::Int32, list_array.value_type());
- assert_eq!(3, list_array.len());
+ assert_eq!(2, list_array.len());
assert_eq!(0, list_array.null_count());
assert_eq!(6, list_array.value_offsets()[1]);
assert_eq!(2, list_array.value_length(1));
@@ -642,8 +643,9 @@ mod tests {
}
// Now test with a non-zero offset
+ // [[3, 4, 5], [6, 7]]
let list_data = ArrayData::builder(list_data_type)
- .len(3)
+ .len(2)
.offset(1)
.add_buffer(value_offsets)
.add_child_data(value_data.clone())
@@ -654,7 +656,7 @@ mod tests {
let values = list_array.values();
assert_eq!(&value_data, values.data());
assert_eq!(DataType::Int32, list_array.value_type());
- assert_eq!(3, list_array.len());
+ assert_eq!(2, list_array.len());
assert_eq!(0, list_array.null_count());
assert_eq!(6, list_array.value_offsets()[1]);
assert_eq!(2, list_array.value_length(1));
@@ -763,12 +765,13 @@ mod tests {
Box::new(Field::new("item", DataType::Int32, false)),
3,
);
- let list_data = ArrayData::builder(list_data_type)
- .len(3)
- .add_child_data(value_data)
- .build()
- .unwrap();
- FixedSizeListArray::from(list_data);
+ let list_data = unsafe {
+ ArrayData::builder(list_data_type)
+ .len(3)
+ .add_child_data(value_data)
+ .build_unchecked()
+ };
+ drop(FixedSizeListArray::from(list_data));
}
#[test]
@@ -1038,19 +1041,21 @@ mod tests {
expected = "ListArray data should contain a single buffer only (value offsets)"
)]
fn test_list_array_invalid_buffer_len() {
- let value_data = ArrayData::builder(DataType::Int32)
- .len(8)
- .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
- .build()
- .unwrap();
+ let value_data = unsafe {
+ ArrayData::builder(DataType::Int32)
+ .len(8)
+ .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
+ .build_unchecked()
+ };
let list_data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
- let list_data = ArrayData::builder(list_data_type)
- .len(3)
- .add_child_data(value_data)
- .build()
- .unwrap();
- ListArray::from(list_data);
+ let list_data = unsafe {
+ ArrayData::builder(list_data_type)
+ .len(3)
+ .add_child_data(value_data)
+ .build_unchecked()
+ };
+ drop(ListArray::from(list_data));
}
#[test]
@@ -1061,12 +1066,13 @@ mod tests {
let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]);
let list_data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
- let list_data = ArrayData::builder(list_data_type)
- .len(3)
- .add_buffer(value_offsets)
- .build()
- .unwrap();
- ListArray::from(list_data);
+ let list_data = unsafe {
+ ArrayData::builder(list_data_type)
+ .len(3)
+ .add_buffer(value_offsets)
+ .build_unchecked()
+ };
+ drop(ListArray::from(list_data));
}
#[test]
@@ -1112,19 +1118,21 @@ mod tests {
let buf2 = buf.slice(1);
let values: [i32; 8] = [0; 8];
- let value_data = ArrayData::builder(DataType::Int32)
- .add_buffer(Buffer::from_slice_ref(&values))
- .build()
- .unwrap();
+ let value_data = unsafe {
+ ArrayData::builder(DataType::Int32)
+ .add_buffer(Buffer::from_slice_ref(&values))
+ .build_unchecked()
+ };
let list_data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
- let list_data = ArrayData::builder(list_data_type)
- .add_buffer(buf2)
- .add_child_data(value_data)
- .build()
- .unwrap();
- ListArray::from(list_data);
+ let list_data = unsafe {
+ ArrayData::builder(list_data_type)
+ .add_buffer(buf2)
+ .add_child_data(value_data)
+ .build_unchecked()
+ };
+ drop(ListArray::from(list_data));
}
#[test]
diff --git a/arrow/src/array/array_map.rs b/arrow/src/array/array_map.rs
index bd888ff..caccebc 100644
--- a/arrow/src/array/array_map.rs
+++ b/arrow/src/array/array_map.rs
@@ -320,7 +320,7 @@ mod tests {
// Now test with a non-zero offset
let map_data = ArrayData::builder(map_array.data_type().clone())
- .len(3)
+ .len(2)
.offset(1)
.add_buffer(map_array.data().buffers()[0].clone())
.add_child_data(map_array.data().child_data()[0].clone())
@@ -331,7 +331,7 @@ mod tests {
let values = map_array.values();
assert_eq!(&value_data, values.data());
assert_eq!(DataType::UInt32, map_array.value_type());
- assert_eq!(3, map_array.len());
+ assert_eq!(2, map_array.len());
assert_eq!(0, map_array.null_count());
assert_eq!(6, map_array.value_offsets()[1]);
assert_eq!(2, map_array.value_length(1));
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index a93e703..ac56b4a 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -894,7 +894,7 @@ mod tests {
#[test]
fn test_primitive_array_builder() {
// Test building a primitive array with ArrayData builder and offset
- let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]);
+ let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]);
let buf2 = buf.clone();
let data = ArrayData::builder(DataType::Int32)
.len(5)
@@ -950,8 +950,16 @@ mod tests {
#[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
(values buffer)")]
fn test_primitive_array_invalid_buffer_len() {
- let data = ArrayData::builder(DataType::Int32).len(5).build().unwrap();
- Int32Array::from(data);
+ let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]);
+ let data = unsafe {
+ ArrayData::builder(DataType::Int32)
+ .add_buffer(buffer.clone())
+ .add_buffer(buffer)
+ .len(5)
+ .build_unchecked()
+ };
+
+ drop(Int32Array::from(data));
}
#[test]
diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index 6460e07..56efcfb 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -137,7 +137,10 @@ impl UnionArray {
}
}
- Ok(Self::new(type_ids, value_offsets, child_arrays, bitmap))
+ let new_self = Self::new(type_ids, value_offsets, child_arrays, bitmap);
+ new_self.data().validate()?;
+
+ Ok(new_self)
}
/// Accesses the child array for `type_id`.
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index dbc5434..3ca14d8 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -18,11 +18,12 @@
//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow array.
+use std::convert::TryInto;
use std::mem;
use std::sync::Arc;
use crate::datatypes::{DataType, IntervalUnit};
-use crate::error::Result;
+use crate::error::{ArrowError, Result};
use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
use crate::{
buffer::{Buffer, MutableBuffer},
@@ -189,6 +190,28 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
}
}
+/// Ensures that a buffer of `buffer_size` bytes is large enough to hold
+/// the `min_size` bytes required for an array of type `data_type`.
+///
+/// `buffer_index` is used in error messages to identify which buffer
+/// was too small.
+///
+/// # Errors
+///
+/// Returns `ArrowError::InvalidArgumentError` when `min_size` is non-zero
+/// and `buffer_size` is smaller than `min_size`.
+fn ensure_size(
+    data_type: &DataType,
+    min_size: usize,
+    buffer_size: usize,
+    buffer_index: usize,
+) -> Result<()> {
+    // if min_size is zero, may not have buffers (e.g. NullArray)
+    if min_size > 0 && buffer_size < min_size {
+        // Report the required size first and the actual size last, matching
+        // the wording of the message ("Need at least ... but got ...").
+        Err(ArrowError::InvalidArgumentError(format!(
+            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+            min_size, buffer_index, data_type, buffer_size
+        )))
+    } else {
+        Ok(())
+    }
+}
+
/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`.
#[inline]
pub(crate) fn into_buffers(
@@ -313,18 +336,6 @@ impl ArrayData {
Ok(new_self)
}
- /// Validates that buffers in this ArrayData are sufficiently
- /// sized, to store `len` + `offset` total elements of
- /// `data_type`.
- ///
- /// This check is "cheap" in the sense that it does not validate the
- /// contents of the buffers (e.g. that string offsets for UTF8 arrays
- /// are within the length of the buffer).
- pub fn validate(&self) -> Result<()> {
- // will be filled in a subsequent PR
- Ok(())
- }
-
/// Returns a builder to construct a `ArrayData` instance.
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
@@ -559,6 +570,437 @@ impl ArrayData {
)
}
}
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// # Errors
+    ///
+    /// Returns `ArrowError::InvalidArgumentError` if `len + offset`
+    /// overflows, if the number or size of the buffers does not match the
+    /// layout of `data_type`, if `null_count` is inconsistent with the
+    /// length or the null bitmap, or if the child data is invalid.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the minimum size of a fixed width buffer overflows `usize`.
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer. Use checked
+        // arithmetic: a wrapped sum would make every size check below
+        // pass against a tiny bogus value in release builds.
+        let len_plus_offset = self.len.checked_add(self.offset).ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Length {} with offset {} overflows usize for array of type {:?}",
+                self.len, self.offset, self.data_type
+            ))
+        })?;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            // NOTE(review): `Bitmap::len()` appears to report a length in
+            // bits rather than bytes (test_bitmap_too_small builds a
+            // one-byte bitmap and expects "got 8") -- confirm. It is
+            // therefore compared against the number of bits required;
+            // comparing it against `needed_len` (bytes) would accept
+            // bitmaps up to eight times too small.
+            if null_bit_buffer.len() < len_plus_offset {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also check this
+                if !DataType::is_dictionary_key_type(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len + 1` offsets (of type
+    /// `T`) stored in `buffer`, starting at index `self.offset`, are
+    /// plausible offsets into some other buffer that is `values_length`
+    /// long.
+    ///
+    /// Only the first and the last offset are inspected: both must be
+    /// convertible to `usize`, neither may exceed `values_length`, and the
+    /// first must not be larger than the last. Offsets in between are not
+    /// examined.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ArrowError::InvalidArgumentError` if `buffer` holds too few
+    /// offsets or if one of the checks above fails.
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this
+        // array's length. Checked arithmetic: a wrapped value would let an
+        // undersized buffer through and make the slicing below panic.
+        let required_offsets = self
+            .len
+            .checked_add(self.offset)
+            .and_then(|n| n.checked_add(1))
+            .ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Length {} with offset {} overflows usize for {}",
+                    self.len, self.offset, self.data_type
+                ))
+            })?;
+
+        // An empty list-like array can have 0 offsets. A non-empty array
+        // must not: it falls through to the size check below.
+        if self.len == 0 && buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { &(buffer.typed_data::<T>()[self.offset..]) };
+
+        let first_offset = offsets[0].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[0] ({}) to usize for {}",
+                offsets[0], self.data_type
+            ))
+        })?;
+
+        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[{}] ({}) to usize for {}",
+                self.len, offsets[self.len], self.data_type
+            ))
+        })?;
+
+        if first_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} of {} is larger than values length {}",
+                first_offset, self.data_type, values_length,
+            )));
+        }
+
+        if last_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset {} of {} is larger than values length {}",
+                last_offset, self.data_type, values_length,
+            )));
+        }
+
+        // Offsets must not decrease, so the first may not exceed the last.
+        if first_offset > last_offset {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} in {} is larger than last offset {}",
+                first_offset, self.data_type, last_offset,
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// Validates the layout of `child_data` ArrayData structures.
+    ///
+    /// Depending on `self.data_type` this checks the number and type of
+    /// the child arrays, validates each of them in turn, and verifies that
+    /// they are long enough for this array. Types without child data must
+    /// have none. Union arrays are not validated yet.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ArrowError::InvalidArgumentError` describing the first
+    /// inconsistency found.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the expected number of values of a `FixedSizeList`
+    /// overflows `usize`.
+    fn validate_child_data(&self) -> Result<()> {
+        match &self.data_type {
+            DataType::List(field) | DataType::Map(field, _) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i32>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::LargeList(field) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i64>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::FixedSizeList(field, list_size) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+
+                let list_size: usize = (*list_size).try_into().map_err(|_| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "{} has a negative list_size {}",
+                        self.data_type, list_size
+                    ))
+                })?;
+
+                // NOTE(review): only `self.len` is considered here, not
+                // `self.offset` -- confirm whether a sliced FixedSizeList
+                // needs `(len + offset) * list_size` child values.
+                let expected_values_len = self.len
+                    .checked_mul(list_size)
+                    .expect("integer overflow computing expected number of expected values in FixedListSize");
+
+                if values_data.len < expected_values_len {
+                    // Report the array length and the list size, the two
+                    // factors of `expected_values_len`.
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
+                        values_data.len, self.len, list_size, self.data_type
+                    )));
+                }
+
+                Ok(())
+            }
+            DataType::Struct(fields) => {
+                self.validate_num_child_data(fields.len())?;
+                for (i, field) in fields.iter().enumerate() {
+                    let field_data = self.get_valid_child_data(i, field.data_type())?;
+
+                    // C++ does this check, but it is not clear why
+                    // field_data checks only len, but self checks len+offset
+                    if field_data.len < (self.len + self.offset) {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
+                            self.data_type, i, field.name(), field_data.len, self.len + self.offset
+                        )));
+                    }
+                }
+                Ok(())
+            }
+            DataType::Union(_fields) => {
+                // Validate Union Array as part of implementing new Union semantics
+                // See comments in `ArrayData::validate()`
+                // https://github.com/apache/arrow-rs/issues/85
+                Ok(())
+            }
+            DataType::Dictionary(_key_type, value_type) => {
+                self.get_single_valid_child_data(value_type)?;
+                Ok(())
+            }
+            _ => {
+                // other types do not have child data
+                if !self.child_data.is_empty() {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Expected no child arrays for type {} but got {}",
+                        self.data_type,
+                        self.child_data.len()
+                    )));
+                }
+                Ok(())
+            }
+        }
+    }
+
+    /// Checks that this array data has exactly one child whose type is
+    /// `expected_type`, validates that child, and hands back a reference
+    /// to it.
+    fn get_single_valid_child_data(
+        &self,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        // The child count is checked first so that its error takes
+        // precedence over any error from the child itself.
+        self.validate_num_child_data(1)
+            .and_then(|_| self.get_valid_child_data(0, expected_type))
+    }
+
+    /// Succeeds only when `self.child_data` holds exactly `expected_len`
+    /// entries; otherwise returns an `InvalidArgumentError` naming the
+    /// expected and the actual count.
+    fn validate_num_child_data(&self, expected_len: usize) -> Result<()> {
+        let actual_len = self.child_data.len();
+        if actual_len == expected_len {
+            return Ok(());
+        }
+
+        Err(ArrowError::InvalidArgumentError(format!(
+            "Value data for {} should contain {} child data array(s), had {}",
+            self.data_type(),
+            expected_len,
+            actual_len
+        )))
+    }
+
+    /// Looks up `child_data[i]`, checks that it has type `expected_type`,
+    /// runs `validate()` on it and returns a reference to it.
+    ///
+    /// Fails if there is no child at index `i`, if the child has a
+    /// different type, or if the child itself does not validate.
+    fn get_valid_child_data(
+        &self,
+        i: usize,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        let child = match self.child_data.get(i) {
+            Some(child) => child,
+            None => {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "{} did not have enough child arrays. Expected at least {} but had only {}",
+                    self.data_type, i+1, self.child_data.len()
+                )));
+            }
+        };
+
+        if expected_type != &child.data_type {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Child type mismatch for {}. Expected {} but child data had {}",
+                self.data_type, expected_type, child.data_type
+            )));
+        }
+
+        // Validation is recursive: the child checks its own children.
+        child.validate().map(|_| child)
+    }
+}
+
+/// Returns the [`DataTypeLayout`] that arrays of `data_type` are
+/// expected to have, i.e. the number and kind of their buffers
+/// (the null bitmap is not part of the layout).
+///
+/// # Panics
+///
+/// Panics for `DataType::Float16` (not implemented) and for a
+/// `DataType::FixedSizeBinary` with a negative size.
+fn layout(data_type: &DataType) -> DataTypeLayout {
+    // based on C/C++ implementation in
+    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
+    use std::mem::size_of;
+    match data_type {
+        DataType::Null => DataTypeLayout::new_empty(),
+        DataType::Boolean => DataTypeLayout {
+            buffers: vec![BufferSpec::BitMap],
+        },
+        DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()),
+        DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()),
+        DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()),
+        DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()),
+        DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()),
+        DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()),
+        DataType::Float16 => unimplemented!(),
+        DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()),
+        DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()),
+        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            DataTypeLayout::new_fixed_width(size_of::<i64>())
+        }
+        DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::FixedSizeBinary(bytes_per_value) => {
+            let bytes_per_value: usize = (*bytes_per_value)
+                .try_into()
+                .expect("negative size for fixed size binary");
+            DataTypeLayout::new_fixed_width(bytes_per_value)
+        }
+        DataType::LargeBinary => DataTypeLayout::new_binary(size_of::<i64>()),
+        DataType::Utf8 => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::<i64>()),
+        // List-like types carry a single buffer of offsets into the child data
+        DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data
+        // LargeList offsets are 64 bit wide (they are validated as `i64`)
+        DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data,
+        DataType::Union(_) => {
+            DataTypeLayout::new_fixed_width(size_of::<u8>())
+            // Note sparse unions only have one buffer (u8) type_ids,
+            // and dense unions have 2 (type_ids as well as offsets).
+            // https://github.com/apache/arrow-rs/issues/85
+        }
+        // A dictionary array stores its keys; the values live in the child data
+        DataType::Dictionary(key_type, _value_type) => layout(key_type),
+        DataType::Decimal(_, _) => {
+            // Decimals are always some fixed width; The rust implementation
+            // always uses 16 bytes / size of i128
+            DataTypeLayout::new_fixed_width(size_of::<i128>())
+        }
+        DataType::Map(_, _) => {
+            // same as ListType
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+    }
+}
+
+/// Describes which buffers an array of a given data type consists of
+#[derive(Debug, PartialEq)]
+// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
+struct DataTypeLayout {
+    /// One [`BufferSpec`] per expected buffer, in buffer order
+    pub buffers: Vec<BufferSpec>,
+}
+
+impl DataTypeLayout {
+    /// Layout with a single buffer in which every element occupies
+    /// exactly `byte_width` bytes (e.g. primitive numeric arrays)
+    pub fn new_fixed_width(byte_width: usize) -> Self {
+        let values = BufferSpec::FixedWidth { byte_width };
+        Self {
+            buffers: vec![values],
+        }
+    }
+
+    /// Layout without any buffers, for arrays that keep no data of
+    /// their own (e.g. FixedSizeList). Such arrays may still have a
+    /// null bitmap, which is not part of the layout
+    pub fn new_empty() -> Self {
+        Self {
+            buffers: Vec::new(),
+        }
+    }
+
+    /// Layout with two buffers: fixed width offsets of
+    /// `offset_byte_width` bytes each, followed by a variable width
+    /// data buffer (e.g. binary and string arrays)
+    pub fn new_binary(offset_byte_width: usize) -> Self {
+        let offsets = BufferSpec::FixedWidth {
+            byte_width: offset_byte_width,
+        };
+        let values = BufferSpec::VariableWidth;
+        Self {
+            buffers: vec![offsets, values],
+        }
+    }
+}
+
+/// Layout specification for a single data type buffer, used by
+/// `ArrayData::validate` to decide how large that buffer has to be
+#[derive(Debug, PartialEq)]
+enum BufferSpec {
+ /// Each element occupies exactly `byte_width` bytes
+ FixedWidth { byte_width: usize },
+ /// Variable width, such as string data for utf8 data. The size
+ /// of such a buffer is not checked by the cheap validation.
+ VariableWidth,
+ /// Buffer holds a bitmap, i.e. one bit per element.
+ ///
+ /// Note: Unlike the C++ implementation, the null/validity buffer
+ /// is handled specially rather than as another of the buffers in
+ /// the spec, so this variant is only used for the Boolean type.
+ BitMap,
+ /// Buffer is always null. Unused currently in Rust implementation,
+ /// (used in C++ for Union type)
+ AlwaysNull,
}
impl PartialEq for ArrayData {
@@ -672,23 +1114,38 @@ impl ArrayDataBuilder {
mod tests {
use super::*;
+ use crate::array::{Array, Int32Array, StringArray};
use crate::buffer::Buffer;
+ use crate::datatypes::Field;
use crate::util::bit_util;
#[test]
- fn test_new() {
- let arr_data =
- ArrayData::try_new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![])
- .unwrap();
- assert_eq!(10, arr_data.len());
- assert_eq!(1, arr_data.null_count());
- assert_eq!(2, arr_data.offset());
- assert_eq!(0, arr_data.buffers().len());
- assert_eq!(0, arr_data.child_data().len());
+ fn test_builder() {
+ // Buffer needs to be at least 25 long
+ let v = (0..25).collect::<Vec<i32>>();
+ let b1 = Buffer::from_slice_ref(&v);
+ let arr_data = ArrayData::builder(DataType::Int32)
+ .len(20)
+ .offset(5)
+ .add_buffer(b1)
+ .null_bit_buffer(Buffer::from(vec![
+ 0b01011111, 0b10110101, 0b01100011, 0b00011110,
+ ]))
+ .build()
+ .unwrap();
+
+ assert_eq!(20, arr_data.len());
+ assert_eq!(10, arr_data.null_count());
+ assert_eq!(5, arr_data.offset());
+ assert_eq!(1, arr_data.buffers().len());
+ assert_eq!(
+ Buffer::from_slice_ref(&v).as_slice(),
+ arr_data.buffers()[0].as_slice()
+ );
}
#[test]
- fn test_builder() {
+ fn test_builder_with_child_data() {
let child_arr_data = ArrayData::try_new(
DataType::Int32,
5,
@@ -699,24 +1156,17 @@ mod tests {
vec![],
)
.unwrap();
- let v = vec![0, 1, 2, 3];
- let b1 = Buffer::from(&v[..]);
- let arr_data = ArrayData::builder(DataType::Int32)
- .len(20)
- .offset(5)
- .add_buffer(b1)
- .null_bit_buffer(Buffer::from(vec![
- 0b01011111, 0b10110101, 0b01100011, 0b00011110,
- ]))
+
+ let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]);
+
+ let arr_data = ArrayData::builder(data_type)
+ .len(5)
+ .offset(0)
.add_child_data(child_arr_data.clone())
.build()
.unwrap();
- assert_eq!(20, arr_data.len());
- assert_eq!(10, arr_data.null_count());
- assert_eq!(5, arr_data.offset());
- assert_eq!(1, arr_data.buffers().len());
- assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice());
+ assert_eq!(5, arr_data.len());
assert_eq!(1, arr_data.child_data().len());
assert_eq!(child_arr_data, arr_data.child_data()[0]);
}
@@ -729,6 +1179,7 @@ mod tests {
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
.len(16)
+ .add_buffer(make_i32_buffer(16))
.null_bit_buffer(Buffer::from(bit_v))
.build()
.unwrap();
@@ -742,6 +1193,7 @@ mod tests {
let arr_data = ArrayData::builder(DataType::Int32)
.len(12)
.offset(2)
+ .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
.null_bit_buffer(Buffer::from(bit_v))
.build()
.unwrap();
@@ -756,6 +1208,7 @@ mod tests {
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
.len(16)
+ .add_buffer(make_i32_buffer(16))
.null_bit_buffer(Buffer::from(bit_v))
.build()
.unwrap();
@@ -771,6 +1224,7 @@ mod tests {
bit_util::set_bit(&mut bit_v, 10);
let data = ArrayData::builder(DataType::Int32)
.len(16)
+ .add_buffer(make_i32_buffer(16))
.null_bit_buffer(Buffer::from(bit_v))
.build()
.unwrap();
@@ -788,8 +1242,16 @@ mod tests {
#[test]
fn test_equality() {
- let int_data = ArrayData::builder(DataType::Int32).build().unwrap();
- let float_data = ArrayData::builder(DataType::Float32).build().unwrap();
+ let int_data = ArrayData::builder(DataType::Int32)
+ .len(1)
+ .add_buffer(make_i32_buffer(1))
+ .build()
+ .unwrap();
+ let float_data = ArrayData::builder(DataType::Float32)
+ .len(1)
+ .add_buffer(make_f32_buffer(1))
+ .build()
+ .unwrap();
assert_ne!(int_data, float_data);
}
@@ -802,4 +1264,380 @@ mod tests {
let count = count_nulls(null_buffer.as_ref(), 4, 8);
assert_eq!(count, 3);
}
+
+ #[test]
+ #[should_panic(
+ expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8"
+ )]
+ fn test_buffer_too_small() {
+ let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ // should fail as the declared size (10*8 = 80) is larger than the underlying buffer (8)
+ ArrayData::try_new(DataType::Int64, 10, Some(0), None, 0, vec![buffer], vec![])
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8"
+ )]
+ fn test_buffer_too_small_offset() {
+ let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ // should fail -- size is ok, but also has offset
+ ArrayData::try_new(DataType::Int64, 1, Some(0), None, 1, vec![buffer], vec![])
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")]
+ fn test_bad_number_of_buffers() {
+ let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]);
+ let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]);
+ ArrayData::try_new(
+ DataType::Int64,
+ 1,
+ Some(0),
+ None,
+ 0,
+ vec![buffer1, buffer2],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "integer overflow computing min buffer size")]
+ fn test_fixed_width_overflow() {
+ let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ ArrayData::try_new(
+ DataType::Int64,
+ usize::MAX,
+ Some(0),
+ None,
+ 0,
+ vec![buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "null_bit_buffer size too small. got 8 needed 13")]
+ fn test_bitmap_too_small() {
+ let buffer = make_i32_buffer(100);
+ let null_bit_buffer = Buffer::from(vec![0b11111111]);
+
+ ArrayData::try_new(
+ DataType::Int32,
+ 100,
+ Some(0),
+ Some(null_bit_buffer),
+ 0,
+ vec![buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "null_count 3 for an array exceeds length of 2 elements")]
+ fn test_bad_null_count() {
+ let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ ArrayData::try_new(DataType::Int32, 2, Some(3), None, 0, vec![buffer], vec![])
+ .unwrap();
+ }
+
+ // Test creating a dictionary with a non-integer key type
+ #[test]
+ #[should_panic(expected = "Dictionary values must be integer, but was Utf8")]
+ fn test_non_int_dictionary() {
+ let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ let data_type =
+ DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32));
+ let child_data = ArrayData::try_new(
+ DataType::Int32,
+ 1,
+ Some(0),
+ None,
+ 0,
+ vec![i32_buffer.clone()],
+ vec![],
+ )
+ .unwrap();
+ ArrayData::try_new(
+ data_type,
+ 1,
+ Some(0),
+ None,
+ 0,
+ vec![i32_buffer.clone(), i32_buffer],
+ vec![child_data],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")]
+ fn test_mismatched_dictionary_types() {
+ // test w/ dictionary created with a child array data that has type different than declared
+ let string_array: StringArray =
+ vec![Some("foo"), Some("bar")].into_iter().collect();
+ let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]);
+ // Dict says LargeUtf8 but array is Utf8
+ let data_type = DataType::Dictionary(
+ Box::new(DataType::Int32),
+ Box::new(DataType::LargeUtf8),
+ );
+ let child_data = string_array.data().clone();
+ ArrayData::try_new(
+ data_type,
+ 1,
+ Some(0),
+ None,
+ 0,
+ vec![i32_buffer],
+ vec![child_data],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
+ )]
+ fn test_validate_offsets_i32() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Offsets buffer size (bytes): 16 isn't large enough for LargeUtf8. Length 2 needs 3"
+ )]
+ fn test_validate_offsets_i64() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]);
+ ArrayData::try_new(
+ DataType::LargeUtf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")]
+ fn test_validate_offsets_negative_first_i32() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")]
+ fn test_validate_offsets_negative_last_i32() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")]
+ fn test_validate_offsets_range_too_small() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ // start offset is larger than end
+ let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")]
+ fn test_validate_offsets_range_too_large() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ // 10 is off the end of the buffer
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")]
+ fn test_validate_offsets_first_too_large() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ // 10 is off the end of the buffer
+ let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ fn test_validate_offsets_first_too_large_skipped() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ // 10 is off the end of the buffer, but offset starts at 1 so it is skipped
+ let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]);
+ let data = ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 1,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ let array: StringArray = data.into();
+ let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect();
+ assert_eq!(array, expected);
+ }
+
+ #[test]
+ #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")]
+ fn test_validate_offsets_last_too_large() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ // last offset (8) is off the end of the 6 byte values buffer
+ let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 2,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
+ )]
+ fn test_validate_fixed_size_list() {
+ // child has 4 elements,
+ let child_array = vec![Some(1), Some(2), Some(3), None]
+ .into_iter()
+ .collect::<Int32Array>();
+
+ // but claim we have 3 elements for a fixed size of 2
+ // (3 * 2 = 6 child values would be needed, but only 4 are available)
+ let field = Field::new("field", DataType::Int32, true);
+ ArrayData::try_new(
+ DataType::FixedSizeList(Box::new(field), 2),
+ 3,
+ None,
+ None,
+ 0,
+ vec![],
+ vec![child_array.data().clone()],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "Child type mismatch for Struct")]
+ fn test_validate_struct_child_type() {
+ let field1 = vec![Some(1), Some(2), Some(3), None]
+ .into_iter()
+ .collect::<Int32Array>();
+
+ // validate that the type of struct fields matches child fields
+ ArrayData::try_new(
+ DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]),
+ 3,
+ None,
+ None,
+ 0,
+ vec![],
+ vec![field1.data().clone()],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)"
+ )]
+ fn test_validate_struct_child_length() {
+ // field length only has 4 items, but array claims to have 6
+ let field1 = vec![Some(1), Some(2), Some(3), None]
+ .into_iter()
+ .collect::<Int32Array>();
+
+ ArrayData::try_new(
+ DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]),
+ 6,
+ None,
+ None,
+ 0,
+ vec![],
+ vec![field1.data().clone()],
+ )
+ .unwrap();
+ }
+
+ /// returns a buffer initialized with some constant value for tests
+ fn make_i32_buffer(n: usize) -> Buffer {
+ Buffer::from_slice_ref(&vec![42i32; n])
+ }
+
+ /// returns a buffer initialized with some constant value for tests
+ fn make_f32_buffer(n: usize) -> Buffer {
+ Buffer::from_slice_ref(&vec![42f32; n])
+ }
}
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index cd5d4a0..13fcf92 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -2153,6 +2153,7 @@ mod tests {
assert_eq!(4, values.null_count());
let u16arr = values.as_any().downcast_ref::<UInt16Array>().unwrap();
+ // expect 4 nulls: negative numbers and overflow
let expected: UInt16Array =
vec![Some(0), Some(0), Some(0), None, None, None, Some(2), None]
.into_iter()
diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs
index f4ddbaf..e4808e2 100644
--- a/arrow/src/compute/util.rs
+++ b/arrow/src/compute/util.rs
@@ -184,7 +184,8 @@ pub(super) mod tests {
offset: usize,
null_bit_buffer: Option<Buffer>,
) -> Arc<ArrayData> {
- // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap
+ let buffer = Buffer::from(&vec![11; len]);
+
Arc::new(
ArrayData::try_new(
DataType::UInt8,
@@ -192,7 +193,7 @@ pub(super) mod tests {
None,
null_bit_buffer,
offset,
- vec![],
+ vec![buffer],
vec![],
)
.unwrap(),
diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
index 1cbec34..96fb18b 100644
--- a/arrow/src/datatypes/datatype.rs
+++ b/arrow/src/datatypes/datatype.rs
@@ -477,6 +477,16 @@ impl DataType {
)
}
+ /// Returns true if this type is valid as a dictionary key
+ /// (e.g. [`super::ArrowDictionaryKeyType`])
+ pub fn is_dictionary_key_type(t: &DataType) -> bool {
+ use DataType::*;
+ matches!(
+ t,
+ UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
+ )
+ }
+
/// Compares the datatype with another, ignoring nested field names
/// and metadata.
pub(crate) fn equals_datatype(&self, other: &DataType) -> bool {
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs
index e925e2a..82cd101 100644
--- a/arrow/src/ipc/reader.rs
+++ b/arrow/src/ipc/reader.rs
@@ -969,13 +969,28 @@ mod tests {
}
#[test]
+ #[should_panic(
+ expected = "Last offset 687865856 of Utf8 is larger than values length 41"
+ )]
+ fn read_dictionary_be_not_implemented() {
+ // The offsets are not translated for big-endian files
+ // https://github.com/apache/arrow-rs/issues/859
+ let testdata = crate::util::test_util::arrow_test_data();
+ let file = File::open(format!(
+ "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_dictionary.arrow_file",
+ testdata
+ ))
+ .unwrap();
+ FileReader::try_new(file).unwrap();
+ }
+
+ #[test]
fn read_generated_be_files_should_work() {
// complementary to the previous test
let testdata = crate::util::test_util::arrow_test_data();
let paths = vec![
"generated_interval",
"generated_datetime",
- "generated_dictionary",
"generated_map",
"generated_nested",
"generated_null_trivial",