You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/11/09 11:47:53 UTC
[arrow-rs] 01/01: Validate arguments to ArrayData::new and null bit buffer and buffers (#810)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch cherry_pick_74b520c4
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 542103a4604b3f87404c91176b1a0e800eeebd04
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Nov 8 14:11:10 2021 -0500

    Validate arguments to ArrayData::new and null bit buffer and buffers (#810)
    
    * Validate arguments to ArrayData::new: null bit buffer and buffers
    
    * Rename is_int_type to is_dictionary_key_type()
    
    * Correctly handle self.offset in offsets buffer
    
    * Consolidate checks
    
    * Fix test output
---
 arrow/src/array/array_binary.rs    |  41 +-
 arrow/src/array/array_boolean.rs   |  11 +-
 arrow/src/array/array_list.rs      |  84 ++--
 arrow/src/array/array_map.rs       |   4 +-
 arrow/src/array/array_primitive.rs |  14 +-
 arrow/src/array/array_union.rs     |   5 +-
 arrow/src/array/data.rs            | 916 +++++++++++++++++++++++++++++++++++--
 arrow/src/compute/kernels/cast.rs  |   1 +
 arrow/src/compute/util.rs          |   5 +-
 arrow/src/datatypes/datatype.rs    |  10 +
 arrow/src/ipc/reader.rs            |  17 +-
 11 files changed, 1004 insertions(+), 104 deletions(-)

diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index 89a3efd..51743d8 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -891,10 +891,18 @@ mod tests {
             assert!(binary_array.is_valid(i));
             assert!(!binary_array.is_null(i));
         }
+    }
+
+    #[test]
+    fn test_binary_array_with_offsets() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+        ];
+        let offsets: [i32; 4] = [0, 5, 5, 12];
 
         // Test binary array with offset
         let array_data = ArrayData::builder(DataType::Binary)
-            .len(4)
+            .len(2)
             .offset(1)
             .add_buffer(Buffer::from_slice_ref(&offsets))
             .add_buffer(Buffer::from_slice_ref(&values))
@@ -947,10 +955,18 @@ mod tests {
             assert!(binary_array.is_valid(i));
             assert!(!binary_array.is_null(i));
         }
+    }
+
+    #[test]
+    fn test_large_binary_array_with_offsets() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+        ];
+        let offsets: [i64; 4] = [0, 5, 5, 12];
 
         // Test binary array with offset
         let array_data = ArrayData::builder(DataType::LargeBinary)
-            .len(4)
+            .len(2)
             .offset(1)
             .add_buffer(Buffer::from_slice_ref(&offsets))
             .add_buffer(Buffer::from_slice_ref(&values))
@@ -1196,26 +1212,25 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "FixedSizeBinaryArray can only be created from list array of u8 values \
-                    (i.e. FixedSizeList<PrimitiveArray<u8>>)."
+        expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
     )]
     fn test_fixed_size_binary_array_from_incorrect_list_array() {
         let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
         let values_data = ArrayData::builder(DataType::UInt32)
             .len(12)
             .add_buffer(Buffer::from_slice_ref(&values))
-            .add_child_data(ArrayData::builder(DataType::Boolean).build().unwrap())
             .build()
             .unwrap();
 
-        let array_data = ArrayData::builder(DataType::FixedSizeList(
-            Box::new(Field::new("item", DataType::Binary, false)),
-            4,
-        ))
-        .len(3)
-        .add_child_data(values_data)
-        .build()
-        .unwrap();
+        let array_data = unsafe {
+            ArrayData::builder(DataType::FixedSizeList(
+                Box::new(Field::new("item", DataType::Binary, false)),
+                4,
+            ))
+            .len(3)
+            .add_child_data(values_data)
+            .build_unchecked()
+        };
         let list_array = FixedSizeListArray::from(array_data);
         FixedSizeBinaryArray::from(list_array);
     }
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index 07f3da6..3b19594 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -332,10 +332,11 @@ mod tests {
     #[should_panic(expected = "BooleanArray data should contain a single buffer only \
                                (values buffer)")]
     fn test_boolean_array_invalid_buffer_len() {
-        let data = ArrayData::builder(DataType::Boolean)
-            .len(5)
-            .build()
-            .unwrap();
-        BooleanArray::from(data);
+        let data = unsafe {
+            ArrayData::builder(DataType::Boolean)
+                .len(5)
+                .build_unchecked()
+        };
+        drop(BooleanArray::from(data));
     }
 }
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index fbba8fc..2d2c62a 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -552,9 +552,10 @@ mod tests {
             assert!(!list_array.is_null(i));
         }
 
-        // Now test with a non-zero offset
+        // Now test with a non-zero offset (skip first element)
+        //  [[3, 4, 5], [6, 7]]
         let list_data = ArrayData::builder(list_data_type)
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(value_offsets)
             .add_child_data(value_data.clone())
@@ -565,7 +566,7 @@ mod tests {
         let values = list_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::Int32, list_array.value_type());
-        assert_eq!(3, list_array.len());
+        assert_eq!(2, list_array.len());
         assert_eq!(0, list_array.null_count());
         assert_eq!(6, list_array.value_offsets()[1]);
         assert_eq!(2, list_array.value_length(1));
@@ -642,8 +643,9 @@ mod tests {
         }
 
         // Now test with a non-zero offset
+        //  [[3, 4, 5], [6, 7]]
         let list_data = ArrayData::builder(list_data_type)
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(value_offsets)
             .add_child_data(value_data.clone())
@@ -654,7 +656,7 @@ mod tests {
         let values = list_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::Int32, list_array.value_type());
-        assert_eq!(3, list_array.len());
+        assert_eq!(2, list_array.len());
         assert_eq!(0, list_array.null_count());
         assert_eq!(6, list_array.value_offsets()[1]);
         assert_eq!(2, list_array.value_length(1));
@@ -763,12 +765,13 @@ mod tests {
             Box::new(Field::new("item", DataType::Int32, false)),
             3,
         );
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        FixedSizeListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(FixedSizeListArray::from(list_data));
     }
 
     #[test]
@@ -1038,19 +1041,21 @@ mod tests {
         expected = "ListArray data should contain a single buffer only (value offsets)"
     )]
     fn test_list_array_invalid_buffer_len() {
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(8)
-            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
-            .build()
-            .unwrap();
+        let value_data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .len(8)
+                .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
+                .build_unchecked()
+        };
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
@@ -1061,12 +1066,13 @@ mod tests {
         let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]);
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_buffer(value_offsets)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_buffer(value_offsets)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
@@ -1112,19 +1118,21 @@ mod tests {
         let buf2 = buf.slice(1);
 
         let values: [i32; 8] = [0; 8];
-        let value_data = ArrayData::builder(DataType::Int32)
-            .add_buffer(Buffer::from_slice_ref(&values))
-            .build()
-            .unwrap();
+        let value_data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .add_buffer(Buffer::from_slice_ref(&values))
+                .build_unchecked()
+        };
 
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .add_buffer(buf2)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .add_buffer(buf2)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
diff --git a/arrow/src/array/array_map.rs b/arrow/src/array/array_map.rs
index bd888ff..caccebc 100644
--- a/arrow/src/array/array_map.rs
+++ b/arrow/src/array/array_map.rs
@@ -320,7 +320,7 @@ mod tests {
 
         // Now test with a non-zero offset
         let map_data = ArrayData::builder(map_array.data_type().clone())
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(map_array.data().buffers()[0].clone())
             .add_child_data(map_array.data().child_data()[0].clone())
@@ -331,7 +331,7 @@ mod tests {
         let values = map_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::UInt32, map_array.value_type());
-        assert_eq!(3, map_array.len());
+        assert_eq!(2, map_array.len());
         assert_eq!(0, map_array.null_count());
         assert_eq!(6, map_array.value_offsets()[1]);
         assert_eq!(2, map_array.value_length(1));
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index a93e703..ac56b4a 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -894,7 +894,7 @@ mod tests {
     #[test]
     fn test_primitive_array_builder() {
         // Test building a primitive array with ArrayData builder and offset
-        let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]);
+        let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]);
         let buf2 = buf.clone();
         let data = ArrayData::builder(DataType::Int32)
             .len(5)
@@ -950,8 +950,16 @@ mod tests {
     #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
                                (values buffer)")]
     fn test_primitive_array_invalid_buffer_len() {
-        let data = ArrayData::builder(DataType::Int32).len(5).build().unwrap();
-        Int32Array::from(data);
+        let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]);
+        let data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .add_buffer(buffer.clone())
+                .add_buffer(buffer)
+                .len(5)
+                .build_unchecked()
+        };
+
+        drop(Int32Array::from(data));
     }
 
     #[test]
diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index 6460e07..56efcfb 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -137,7 +137,10 @@ impl UnionArray {
             }
         }
 
-        Ok(Self::new(type_ids, value_offsets, child_arrays, bitmap))
+        let new_self = Self::new(type_ids, value_offsets, child_arrays, bitmap);
+        new_self.data().validate()?;
+
+        Ok(new_self)
     }
 
     /// Accesses the child array for `type_id`.
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index dbc5434..3ca14d8 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -18,11 +18,12 @@
 //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
 //! common attributes and operations for Arrow array.
 
+use std::convert::TryInto;
 use std::mem;
 use std::sync::Arc;
 
 use crate::datatypes::{DataType, IntervalUnit};
-use crate::error::Result;
+use crate::error::{ArrowError, Result};
 use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
 use crate::{
     buffer::{Buffer, MutableBuffer},
@@ -189,6 +190,28 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
     }
 }
 
+/// Ensures that a buffer of size `buffer_size` is at least `min_size`
+/// (both reported as bytes) for an array of type `data_type`.
+///
+/// `buffer_index` is used in error messages to identify which buffer
+/// was too small. Returns `InvalidArgumentError` when the check fails.
+fn ensure_size(
+    data_type: &DataType,
+    min_size: usize,
+    buffer_size: usize,
+    buffer_index: usize,
+) -> Result<()> {
+    // if min_size is zero, may not have buffers (e.g. NullArray)
+    if min_size > 0 && buffer_size < min_size {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+            min_size, buffer_index, data_type, buffer_size
+        )))
+    } else {
+        Ok(())
+    }
+}
+
 /// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`.
 #[inline]
 pub(crate) fn into_buffers(
@@ -313,18 +336,6 @@ impl ArrayData {
         Ok(new_self)
     }
 
-    /// Validates that buffers in this ArrayData are sufficiently
-    /// sized, to store `len` + `offset` total elements of
-    /// `data_type`.
-    ///
-    /// This check is "cheap" in the sense that it does not validate the
-    /// contents of the buffers (e.g. that string offsets for UTF8 arrays
-    /// are within the length of the buffer).
-    pub fn validate(&self) -> Result<()> {
-        // will be filled in a subsequent PR
-        Ok(())
-    }
-
     /// Returns a builder to construct a `ArrayData` instance.
     #[inline]
     pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
@@ -559,6 +570,437 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks,
+    /// returning `InvalidArgumentError` on the first violation found.
+    ///
+    /// Buffer contents are not validated; panics if `len` + `offset` or a
+    /// minimum buffer size overflows `usize`.
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this much space in each buffer
+        let len_plus_offset = self.len.checked_add(self.offset)
+            .expect("integer overflow computing len + offset");
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            if null_bit_buffer.len() < needed_len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also check this
+                if !DataType::is_dictionary_key_type(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
+    /// offsets (of type T) into some other buffer of `values_length` bytes long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this array's length
+        let required_offsets = self.len + self.offset + 1;
+
+        // An empty list-like array can have 0 offsets
+        if buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { &(buffer.typed_data::<T>()[self.offset..]) };
+
+        let first_offset = offsets[0].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[0] ({}) to usize for {}",
+                offsets[0], self.data_type
+            ))
+        })?;
+
+        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[{}] ({}) to usize for {}",
+                self.len, offsets[self.len], self.data_type
+            ))
+        })?;
+
+        if first_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} of {} is larger than values length {}",
+                first_offset, self.data_type, values_length,
+            )));
+        }
+
+        if last_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset {} of {} is larger than values length {}",
+                last_offset, self.data_type, values_length,
+            )));
+        }
+        // NOTE(review): fires when first > last, yet the text says "smaller"; reword together with any tests pinning it
+        if first_offset > last_offset {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} in {} is smaller than last offset {}",
+                first_offset, self.data_type, last_offset,
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// Validates the number, types and lengths of `child_data` for nested types
+    fn validate_child_data(&self) -> Result<()> {
+        match &self.data_type {
+            DataType::List(field) | DataType::Map(field, _) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i32>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::LargeList(field) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i64>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::FixedSizeList(field, list_size) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+
+                let list_size: usize = (*list_size).try_into().map_err(|_| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "{} has a negative list_size {}",
+                        self.data_type, list_size
+                    ))
+                })?;
+
+                let expected_values_len = self.len
+                    .checked_mul(list_size)
+                    .expect("integer overflow computing expected number of expected values in FixedListSize");
+                // NOTE(review): the message prints `list_size` as the "length" too; likely meant `self.len` - confirm (tests may pin it)
+                if values_data.len < expected_values_len {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
+                        values_data.len, list_size, list_size, self.data_type
+                    )));
+                }
+
+                Ok(())
+            }
+            DataType::Struct(fields) => {
+                self.validate_num_child_data(fields.len())?;
+                for (i, field) in fields.iter().enumerate() {
+                    let field_data = self.get_valid_child_data(i, field.data_type())?;
+
+                    // C++ does this check, but it is not clear why
+                    // field_data checks only len, but self checks len+offset
+                    if field_data.len < (self.len + self.offset) {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
+                            self.data_type, i, field.name(), field_data.len, self.len + self.offset
+                        )));
+                    }
+                }
+                Ok(())
+            }
+            DataType::Union(_fields) => {
+                // Validate Union Array as part of implementing new Union semantics
+                // See comments in `ArrayData::validate()`
+                // https://github.com/apache/arrow-rs/issues/85
+                Ok(())
+            }
+            DataType::Dictionary(_key_type, value_type) => {
+                self.get_single_valid_child_data(value_type)?;
+                Ok(())
+            }
+            _ => {
+                // other types do not have child data
+                if !self.child_data.is_empty() {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Expected no child arrays for type {} but got {}",
+                        self.data_type,
+                        self.child_data.len()
+                    )));
+                }
+                Ok(())
+            }
+        }
+    }
+
+    /// Checks that there is exactly one child_data entry, that it has
+    /// `expected_type`, and that it passes `validate()`. On success a
+    /// reference to that child_data is returned
+    fn get_single_valid_child_data(
+        &self,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        self.validate_num_child_data(1)
+            .and_then(|_| self.get_valid_child_data(0, expected_type))
+    }
+
+    /// Succeeds only when self.child_data holds exactly `expected_len` entries
+    fn validate_num_child_data(&self, expected_len: usize) -> Result<()> {
+        let actual_len = self.child_data.len();
+        if actual_len == expected_len {
+            return Ok(());
+        }
+        Err(ArrowError::InvalidArgumentError(format!(
+            "Value data for {} should contain {} child data array(s), had {}",
+            self.data_type,
+            expected_len,
+            actual_len
+        )))
+    }
+
+    /// Looks up `child_data[i]`, checks it has `expected_type`, runs
+    /// `validate()` on it, and hands back a reference to that child_data
+    fn get_valid_child_data(
+        &self,
+        i: usize,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        let child = match self.child_data.get(i) {
+            Some(child) => child,
+            None => {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "{} did not have enough child arrays. Expected at least {} but had only {}",
+                    self.data_type, i + 1, self.child_data.len()
+                )))
+            }
+        };
+
+        if &child.data_type != expected_type {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Child type mismatch for {}. Expected {} but child data had {}",
+                self.data_type, expected_type, child.data_type
+            )));
+        }
+
+        child.validate().map(|_| child)
+    }
+}
+
+/// Returns the [`DataTypeLayout`] (one [`BufferSpec`] per buffer) that
+/// arrays of `data_type` are expected to have
+fn layout(data_type: &DataType) -> DataTypeLayout {
+    // based on C/C++ implementation in
+    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
+    use std::mem::size_of;
+    match data_type {
+        DataType::Null => DataTypeLayout::new_empty(),
+        DataType::Boolean => DataTypeLayout {
+            buffers: vec![BufferSpec::BitMap],
+        },
+        DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()),
+        DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()),
+        DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()),
+        DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()),
+        DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()),
+        DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()),
+        DataType::Float16 => unimplemented!(),
+        DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()),
+        DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()),
+        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            DataTypeLayout::new_fixed_width(size_of::<i64>())
+        }
+        DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::FixedSizeBinary(bytes_per_value) => {
+            let bytes_per_value: usize = (*bytes_per_value)
+                .try_into()
+                .expect("negative size for fixed size binary");
+            DataTypeLayout::new_fixed_width(bytes_per_value)
+        }
+        DataType::LargeBinary => DataTypeLayout::new_binary(size_of::<i64>()),
+        DataType::Utf8 => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::<i64>()),
+        DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data
+        DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()), // i64 offsets
+        DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data,
+        DataType::Union(_) => {
+            DataTypeLayout::new_fixed_width(size_of::<u8>())
+            // Note sparse unions only have one buffer (u8) type_ids,
+            // and dense unions have 2 (type_ids as well as offsets).
+            // https://github.com/apache/arrow-rs/issues/85
+        }
+        DataType::Dictionary(key_type, _value_type) => layout(key_type),
+        DataType::Decimal(_, _) => {
+            // Decimals are always some fixed width; The rust implementation
+            // always uses 16 bytes / size of i128
+            DataTypeLayout::new_fixed_width(size_of::<i128>())
+        }
+        DataType::Map(_, _) => {
+            // same as ListType
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+    }
+}
+
+/// Layout specification for a data type
+#[derive(Debug, PartialEq)]
+// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
+struct DataTypeLayout {
+    /// A vector of buffer layout specifications, one for each expected buffer
+    pub buffers: Vec<BufferSpec>,
+}
+
+impl DataTypeLayout {
+    /// Describes a basic numeric array where each element has a fixed width
+    pub fn new_fixed_width(byte_width: usize) -> Self {
+        Self {
+            buffers: vec![BufferSpec::FixedWidth { byte_width }],
+        }
+    }
+
+    /// Describes arrays which have no data of their own
+    /// (e.g. FixedSizeList). Note such arrays may still have a Null
+    /// Bitmap
+    pub fn new_empty() -> Self {
+        Self { buffers: vec![] }
+    }
+
+    /// Describes a variable width array (e.g. binary or utf8): an offsets
+    /// buffer whose entries are `offset_byte_width` bytes wide, followed
+    /// by a variable width data buffer
+    pub fn new_binary(offset_byte_width: usize) -> Self {
+        Self {
+            buffers: vec![
+                // offsets
+                BufferSpec::FixedWidth {
+                    byte_width: offset_byte_width,
+                },
+                // values
+                BufferSpec::VariableWidth,
+            ],
+        }
+    }
+}
+
+/// Layout specification for a single data type buffer
+#[derive(Debug, PartialEq)]
+enum BufferSpec {
+    /// Each element has a fixed width of `byte_width` bytes
+    FixedWidth { byte_width: usize },
+    /// Variable width, such as string data for utf8 data
+    VariableWidth,
+    /// Buffer holds a bitmap.
+    ///
+    /// Note: Unlike the C++ implementation, the null/validity buffer
+    /// is handled specially rather than as another of the buffers in
+    /// the spec, so this variant is only used for the Boolean type.
+    BitMap,
+    /// Buffer is always null. Currently unused in the Rust implementation
+    /// (used in C++ for the Union type)
+    AlwaysNull,
 }
 
 impl PartialEq for ArrayData {
@@ -672,23 +1114,38 @@ impl ArrayDataBuilder {
 mod tests {
     use super::*;
 
+    use crate::array::{Array, Int32Array, StringArray};
     use crate::buffer::Buffer;
+    use crate::datatypes::Field;
     use crate::util::bit_util;
 
     #[test]
-    fn test_new() {
-        let arr_data =
-            ArrayData::try_new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![])
-                .unwrap();
-        assert_eq!(10, arr_data.len());
-        assert_eq!(1, arr_data.null_count());
-        assert_eq!(2, arr_data.offset());
-        assert_eq!(0, arr_data.buffers().len());
-        assert_eq!(0, arr_data.child_data().len());
+    fn test_builder() {
+        // Buffer needs to hold at least 25 elements (offset 5 + len 20)
+        let v = (0..25).collect::<Vec<i32>>();
+        let b1 = Buffer::from_slice_ref(&v);
+        let arr_data = ArrayData::builder(DataType::Int32)
+            .len(20)
+            .offset(5)
+            .add_buffer(b1)
+            .null_bit_buffer(Buffer::from(vec![
+                0b01011111, 0b10110101, 0b01100011, 0b00011110,
+            ]))
+            .build()
+            .unwrap();
+
+        assert_eq!(20, arr_data.len());
+        assert_eq!(10, arr_data.null_count());
+        assert_eq!(5, arr_data.offset());
+        assert_eq!(1, arr_data.buffers().len());
+        assert_eq!(
+            Buffer::from_slice_ref(&v).as_slice(),
+            arr_data.buffers()[0].as_slice()
+        );
     }
 
     #[test]
-    fn test_builder() {
+    fn test_builder_with_child_data() {
         let child_arr_data = ArrayData::try_new(
             DataType::Int32,
             5,
@@ -699,24 +1156,17 @@ mod tests {
             vec![],
         )
         .unwrap();
-        let v = vec![0, 1, 2, 3];
-        let b1 = Buffer::from(&v[..]);
-        let arr_data = ArrayData::builder(DataType::Int32)
-            .len(20)
-            .offset(5)
-            .add_buffer(b1)
-            .null_bit_buffer(Buffer::from(vec![
-                0b01011111, 0b10110101, 0b01100011, 0b00011110,
-            ]))
+
+        let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]);
+
+        let arr_data = ArrayData::builder(data_type)
+            .len(5)
+            .offset(0)
             .add_child_data(child_arr_data.clone())
             .build()
             .unwrap();
 
-        assert_eq!(20, arr_data.len());
-        assert_eq!(10, arr_data.null_count());
-        assert_eq!(5, arr_data.offset());
-        assert_eq!(1, arr_data.buffers().len());
-        assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice());
+        assert_eq!(5, arr_data.len());
         assert_eq!(1, arr_data.child_data().len());
         assert_eq!(child_arr_data, arr_data.child_data()[0]);
     }
@@ -729,6 +1179,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -742,6 +1193,7 @@ mod tests {
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(12)
             .offset(2)
+            .add_buffer(make_i32_buffer(14)) // offset (2) + len (12) requires at least 14 elements
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -756,6 +1208,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -771,6 +1224,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -788,8 +1242,16 @@ mod tests {
 
     #[test]
     fn test_equality() {
-        let int_data = ArrayData::builder(DataType::Int32).build().unwrap();
-        let float_data = ArrayData::builder(DataType::Float32).build().unwrap();
+        let int_data = ArrayData::builder(DataType::Int32)
+            .len(1)
+            .add_buffer(make_i32_buffer(1))
+            .build()
+            .unwrap();
+        let float_data = ArrayData::builder(DataType::Float32)
+            .len(1)
+            .add_buffer(make_f32_buffer(1))
+            .build()
+            .unwrap();
         assert_ne!(int_data, float_data);
     }
 
@@ -802,4 +1264,380 @@ mod tests {
         let count = count_nulls(null_buffer.as_ref(), 4, 8);
         assert_eq!(count, 3);
     }
+
+    #[test]
+    #[should_panic(
+        expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8"
+    )]
+    fn test_buffer_too_small() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        // should fail as the declared size (10*8 = 80) is larger than the underlying buffer (8)
+        ArrayData::try_new(DataType::Int64, 10, Some(0), None, 0, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8"
+    )]
+    fn test_buffer_too_small_offset() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        // should fail -- size is ok, but also has offset
+        ArrayData::try_new(DataType::Int64, 1, Some(0), None, 1, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")]
+    fn test_bad_number_of_buffers() {
+        let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]);
+        let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Int64,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![buffer1, buffer2],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "integer overflow computing min buffer size")]
+    fn test_fixed_width_overflow() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Int64,
+            usize::MAX,
+            Some(0),
+            None,
+            0,
+            vec![buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "null_bit_buffer size too small. got 8 needed 13")]
+    fn test_bitmap_too_small() {
+        let buffer = make_i32_buffer(100);
+        let null_bit_buffer = Buffer::from(vec![0b11111111]);
+
+        ArrayData::try_new(
+            DataType::Int32,
+            100,
+            Some(0),
+            Some(null_bit_buffer),
+            0,
+            vec![buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "null_count 3 for an array exceeds length of 2 elements")]
+    fn test_bad_null_count() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(DataType::Int32, 2, Some(3), None, 0, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    // Test creating a dictionary with a non integer type
+    #[test]
+    #[should_panic(expected = "Dictionary values must be integer, but was Utf8")]
+    fn test_non_int_dictionary() {
+        let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        let data_type =
+            DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32));
+        let child_data = ArrayData::try_new(
+            DataType::Int32,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer.clone()],
+            vec![],
+        )
+        .unwrap();
+        ArrayData::try_new(
+            data_type,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer.clone(), i32_buffer],
+            vec![child_data],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")]
+    fn test_mismatched_dictionary_types() {
+        // test w/ dictionary created with a child array data that has type different than declared
+        let string_array: StringArray =
+            vec![Some("foo"), Some("bar")].into_iter().collect();
+        let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]);
+        // Dict says LargeUtf8 but array is Utf8
+        let data_type = DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::LargeUtf8),
+        );
+        let child_data = string_array.data().clone();
+        ArrayData::try_new(
+            data_type,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer],
+            vec![child_data],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
+    )]
+    fn test_validate_offsets_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offsets buffer size (bytes): 16 isn't large enough for LargeUtf8. Length 2 needs 3"
+    )]
+    fn test_validate_offsets_i64() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]);
+        ArrayData::try_new(
+            DataType::LargeUtf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")]
+    fn test_validate_offsets_negative_first_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")]
+    fn test_validate_offsets_negative_last_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")]
+    fn test_validate_offsets_range_too_small() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // start offset is larger than end
+        let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_range_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_first_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_validate_offsets_first_too_large_skipped() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer, but offset starts at 1 so it is skipped
+        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]);
+        let data = ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            1,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+        let array: StringArray = data.into();
+        let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect();
+        assert_eq!(array, expected);
+    }
+
+    #[test]
+    #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_last_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 8 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
+    )]
+    fn test_validate_fixed_size_list() {
+        // child has 4 elements,
+        let child_array = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        // but claim we have 3 elements for a fixed size of 2,
+        // which would require 3 * 2 = 6 child values
+        let field = Field::new("field", DataType::Int32, true);
+        ArrayData::try_new(
+            DataType::FixedSizeList(Box::new(field), 2),
+            3,
+            None,
+            None,
+            0,
+            vec![],
+            vec![child_array.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Child type mismatch for Struct")]
+    fn test_validate_struct_child_type() {
+        let field1 = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        // validate that the type of struct fields matches child fields
+        ArrayData::try_new(
+            DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]),
+            3,
+            None,
+            None,
+            0,
+            vec![],
+            vec![field1.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)"
+    )]
+    fn test_validate_struct_child_length() {
+        // field length only has 4 items, but array claims to have 6
+        let field1 = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        ArrayData::try_new(
+            DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]),
+            6,
+            None,
+            None,
+            0,
+            vec![],
+            vec![field1.data().clone()],
+        )
+        .unwrap();
+    }
+
+    /// returns a buffer initialized with some constant value for tests
+    fn make_i32_buffer(n: usize) -> Buffer {
+        Buffer::from_slice_ref(&vec![42i32; n])
+    }
+
+    /// returns a buffer initialized with some constant value for tests
+    fn make_f32_buffer(n: usize) -> Buffer {
+        Buffer::from_slice_ref(&vec![42f32; n])
+    }
 }
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index cd5d4a0..13fcf92 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -2153,6 +2153,7 @@ mod tests {
         assert_eq!(4, values.null_count());
         let u16arr = values.as_any().downcast_ref::<UInt16Array>().unwrap();
 
+        // expect 4 nulls: negative numbers and overflow
         let expected: UInt16Array =
             vec![Some(0), Some(0), Some(0), None, None, None, Some(2), None]
                 .into_iter()
diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs
index f4ddbaf..e4808e2 100644
--- a/arrow/src/compute/util.rs
+++ b/arrow/src/compute/util.rs
@@ -184,7 +184,8 @@ pub(super) mod tests {
         offset: usize,
         null_bit_buffer: Option<Buffer>,
     ) -> Arc<ArrayData> {
-        // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap
+        let buffer = Buffer::from(&vec![11; len]);
+
         Arc::new(
             ArrayData::try_new(
                 DataType::UInt8,
@@ -192,7 +193,7 @@ pub(super) mod tests {
                 None,
                 null_bit_buffer,
                 offset,
-                vec![],
+                vec![buffer],
                 vec![],
             )
             .unwrap(),
diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
index 1cbec34..96fb18b 100644
--- a/arrow/src/datatypes/datatype.rs
+++ b/arrow/src/datatypes/datatype.rs
@@ -477,6 +477,16 @@ impl DataType {
         )
     }
 
+    /// Returns true if this type is valid as a dictionary key
+    /// (e.g. [`super::ArrowDictionaryKeyType`])
+    pub fn is_dictionary_key_type(t: &DataType) -> bool {
+        use DataType::*;
+        matches!(
+            t,
+            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
+        )
+    }
+
     /// Compares the datatype with another, ignoring nested field names
     /// and metadata.
     pub(crate) fn equals_datatype(&self, other: &DataType) -> bool {
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs
index e925e2a..82cd101 100644
--- a/arrow/src/ipc/reader.rs
+++ b/arrow/src/ipc/reader.rs
@@ -969,13 +969,28 @@ mod tests {
     }
 
     #[test]
+    #[should_panic(
+        expected = "Last offset 687865856 of Utf8 is larger than values length 41"
+    )]
+    fn read_dictionary_be_not_implemented() {
+        // The offsets are not translated for big-endian files
+        // https://github.com/apache/arrow-rs/issues/859
+        let testdata = crate::util::test_util::arrow_test_data();
+        let file = File::open(format!(
+                "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_dictionary.arrow_file",
+                testdata
+            ))
+            .unwrap();
+        FileReader::try_new(file).unwrap();
+    }
+
+    #[test]
     fn read_generated_be_files_should_work() {
         // complementary to the previous test
         let testdata = crate::util::test_util::arrow_test_data();
         let paths = vec![
             "generated_interval",
             "generated_datetime",
-            "generated_dictionary",
             "generated_map",
             "generated_nested",
             "generated_null_trivial",