You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/11/09 11:47:52 UTC

[arrow-rs] branch cherry_pick_74b520c4 created (now 542103a)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch cherry_pick_74b520c4
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git.


      at 542103a  Validate arguments to ArrayData::new and null bit buffer and buffers (#810)

This branch includes the following new commits:

     new 542103a  Validate arguments to ArrayData::new and null bit buffer and buffers (#810)

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[arrow-rs] 01/01: Validate arguments to ArrayData::new and null bit buffer and buffers (#810)

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch cherry_pick_74b520c4
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 542103a4604b3f87404c91176b1a0e800eeebd04
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Nov 8 14:11:10 2021 -0500

    Validate arguments to ArrayData::new and null bit buffer and buffers (#810)
    
    * Validate arguments to ArrayData::new: null bit buffer and buffers
    
    * REname is_int_type to is_dictionary_key_type()
    
    * Correctly handle self.offset in offsets buffer
    
    * Consolidate checks
    
    * Fix test output
---
 arrow/src/array/array_binary.rs    |  41 +-
 arrow/src/array/array_boolean.rs   |  11 +-
 arrow/src/array/array_list.rs      |  84 ++--
 arrow/src/array/array_map.rs       |   4 +-
 arrow/src/array/array_primitive.rs |  14 +-
 arrow/src/array/array_union.rs     |   5 +-
 arrow/src/array/data.rs            | 916 +++++++++++++++++++++++++++++++++++--
 arrow/src/compute/kernels/cast.rs  |   1 +
 arrow/src/compute/util.rs          |   5 +-
 arrow/src/datatypes/datatype.rs    |  10 +
 arrow/src/ipc/reader.rs            |  17 +-
 11 files changed, 1004 insertions(+), 104 deletions(-)

diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index 89a3efd..51743d8 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -891,10 +891,18 @@ mod tests {
             assert!(binary_array.is_valid(i));
             assert!(!binary_array.is_null(i));
         }
+    }
+
+    #[test]
+    fn test_binary_array_with_offsets() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+        ];
+        let offsets: [i32; 4] = [0, 5, 5, 12];
 
         // Test binary array with offset
         let array_data = ArrayData::builder(DataType::Binary)
-            .len(4)
+            .len(2)
             .offset(1)
             .add_buffer(Buffer::from_slice_ref(&offsets))
             .add_buffer(Buffer::from_slice_ref(&values))
@@ -947,10 +955,18 @@ mod tests {
             assert!(binary_array.is_valid(i));
             assert!(!binary_array.is_null(i));
         }
+    }
+
+    #[test]
+    fn test_large_binary_array_with_offsets() {
+        let values: [u8; 12] = [
+            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
+        ];
+        let offsets: [i64; 4] = [0, 5, 5, 12];
 
         // Test binary array with offset
         let array_data = ArrayData::builder(DataType::LargeBinary)
-            .len(4)
+            .len(2)
             .offset(1)
             .add_buffer(Buffer::from_slice_ref(&offsets))
             .add_buffer(Buffer::from_slice_ref(&values))
@@ -1196,26 +1212,25 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "FixedSizeBinaryArray can only be created from list array of u8 values \
-                    (i.e. FixedSizeList<PrimitiveArray<u8>>)."
+        expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
     )]
     fn test_fixed_size_binary_array_from_incorrect_list_array() {
         let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
         let values_data = ArrayData::builder(DataType::UInt32)
             .len(12)
             .add_buffer(Buffer::from_slice_ref(&values))
-            .add_child_data(ArrayData::builder(DataType::Boolean).build().unwrap())
             .build()
             .unwrap();
 
-        let array_data = ArrayData::builder(DataType::FixedSizeList(
-            Box::new(Field::new("item", DataType::Binary, false)),
-            4,
-        ))
-        .len(3)
-        .add_child_data(values_data)
-        .build()
-        .unwrap();
+        let array_data = unsafe {
+            ArrayData::builder(DataType::FixedSizeList(
+                Box::new(Field::new("item", DataType::Binary, false)),
+                4,
+            ))
+            .len(3)
+            .add_child_data(values_data)
+            .build_unchecked()
+        };
         let list_array = FixedSizeListArray::from(array_data);
         FixedSizeBinaryArray::from(list_array);
     }
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index 07f3da6..3b19594 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -332,10 +332,11 @@ mod tests {
     #[should_panic(expected = "BooleanArray data should contain a single buffer only \
                                (values buffer)")]
     fn test_boolean_array_invalid_buffer_len() {
-        let data = ArrayData::builder(DataType::Boolean)
-            .len(5)
-            .build()
-            .unwrap();
-        BooleanArray::from(data);
+        let data = unsafe {
+            ArrayData::builder(DataType::Boolean)
+                .len(5)
+                .build_unchecked()
+        };
+        drop(BooleanArray::from(data));
     }
 }
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index fbba8fc..2d2c62a 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -552,9 +552,10 @@ mod tests {
             assert!(!list_array.is_null(i));
         }
 
-        // Now test with a non-zero offset
+        // Now test with a non-zero offset (skip first element)
+        //  [[3, 4, 5], [6, 7]]
         let list_data = ArrayData::builder(list_data_type)
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(value_offsets)
             .add_child_data(value_data.clone())
@@ -565,7 +566,7 @@ mod tests {
         let values = list_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::Int32, list_array.value_type());
-        assert_eq!(3, list_array.len());
+        assert_eq!(2, list_array.len());
         assert_eq!(0, list_array.null_count());
         assert_eq!(6, list_array.value_offsets()[1]);
         assert_eq!(2, list_array.value_length(1));
@@ -642,8 +643,9 @@ mod tests {
         }
 
         // Now test with a non-zero offset
+        //  [[3, 4, 5], [6, 7]]
         let list_data = ArrayData::builder(list_data_type)
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(value_offsets)
             .add_child_data(value_data.clone())
@@ -654,7 +656,7 @@ mod tests {
         let values = list_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::Int32, list_array.value_type());
-        assert_eq!(3, list_array.len());
+        assert_eq!(2, list_array.len());
         assert_eq!(0, list_array.null_count());
         assert_eq!(6, list_array.value_offsets()[1]);
         assert_eq!(2, list_array.value_length(1));
@@ -763,12 +765,13 @@ mod tests {
             Box::new(Field::new("item", DataType::Int32, false)),
             3,
         );
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        FixedSizeListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(FixedSizeListArray::from(list_data));
     }
 
     #[test]
@@ -1038,19 +1041,21 @@ mod tests {
         expected = "ListArray data should contain a single buffer only (value offsets)"
     )]
     fn test_list_array_invalid_buffer_len() {
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(8)
-            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
-            .build()
-            .unwrap();
+        let value_data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .len(8)
+                .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
+                .build_unchecked()
+        };
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
@@ -1061,12 +1066,13 @@ mod tests {
         let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]);
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .len(3)
-            .add_buffer(value_offsets)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_buffer(value_offsets)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
@@ -1112,19 +1118,21 @@ mod tests {
         let buf2 = buf.slice(1);
 
         let values: [i32; 8] = [0; 8];
-        let value_data = ArrayData::builder(DataType::Int32)
-            .add_buffer(Buffer::from_slice_ref(&values))
-            .build()
-            .unwrap();
+        let value_data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .add_buffer(Buffer::from_slice_ref(&values))
+                .build_unchecked()
+        };
 
         let list_data_type =
             DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
-        let list_data = ArrayData::builder(list_data_type)
-            .add_buffer(buf2)
-            .add_child_data(value_data)
-            .build()
-            .unwrap();
-        ListArray::from(list_data);
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .add_buffer(buf2)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(ListArray::from(list_data));
     }
 
     #[test]
diff --git a/arrow/src/array/array_map.rs b/arrow/src/array/array_map.rs
index bd888ff..caccebc 100644
--- a/arrow/src/array/array_map.rs
+++ b/arrow/src/array/array_map.rs
@@ -320,7 +320,7 @@ mod tests {
 
         // Now test with a non-zero offset
         let map_data = ArrayData::builder(map_array.data_type().clone())
-            .len(3)
+            .len(2)
             .offset(1)
             .add_buffer(map_array.data().buffers()[0].clone())
             .add_child_data(map_array.data().child_data()[0].clone())
@@ -331,7 +331,7 @@ mod tests {
         let values = map_array.values();
         assert_eq!(&value_data, values.data());
         assert_eq!(DataType::UInt32, map_array.value_type());
-        assert_eq!(3, map_array.len());
+        assert_eq!(2, map_array.len());
         assert_eq!(0, map_array.null_count());
         assert_eq!(6, map_array.value_offsets()[1]);
         assert_eq!(2, map_array.value_length(1));
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index a93e703..ac56b4a 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -894,7 +894,7 @@ mod tests {
     #[test]
     fn test_primitive_array_builder() {
         // Test building a primitive array with ArrayData builder and offset
-        let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]);
+        let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]);
         let buf2 = buf.clone();
         let data = ArrayData::builder(DataType::Int32)
             .len(5)
@@ -950,8 +950,16 @@ mod tests {
     #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
                                (values buffer)")]
     fn test_primitive_array_invalid_buffer_len() {
-        let data = ArrayData::builder(DataType::Int32).len(5).build().unwrap();
-        Int32Array::from(data);
+        let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]);
+        let data = unsafe {
+            ArrayData::builder(DataType::Int32)
+                .add_buffer(buffer.clone())
+                .add_buffer(buffer)
+                .len(5)
+                .build_unchecked()
+        };
+
+        drop(Int32Array::from(data));
     }
 
     #[test]
diff --git a/arrow/src/array/array_union.rs b/arrow/src/array/array_union.rs
index 6460e07..56efcfb 100644
--- a/arrow/src/array/array_union.rs
+++ b/arrow/src/array/array_union.rs
@@ -137,7 +137,10 @@ impl UnionArray {
             }
         }
 
-        Ok(Self::new(type_ids, value_offsets, child_arrays, bitmap))
+        let new_self = Self::new(type_ids, value_offsets, child_arrays, bitmap);
+        new_self.data().validate()?;
+
+        Ok(new_self)
     }
 
     /// Accesses the child array for `type_id`.
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index dbc5434..3ca14d8 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -18,11 +18,12 @@
 //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
 //! common attributes and operations for Arrow array.
 
+use std::convert::TryInto;
 use std::mem;
 use std::sync::Arc;
 
 use crate::datatypes::{DataType, IntervalUnit};
-use crate::error::Result;
+use crate::error::{ArrowError, Result};
 use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
 use crate::{
     buffer::{Buffer, MutableBuffer},
@@ -189,6 +190,28 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
     }
 }
 
+/// Ensures that at least `min_size` elements of type `data_type` can
+/// be stored in a buffer of `buffer_size`.
+///
+/// `buffer_index` is used in error messages to identify which buffer
+/// had the invalid index
+fn ensure_size(
+    data_type: &DataType,
+    min_size: usize,
+    buffer_size: usize,
+    buffer_index: usize,
+) -> Result<()> {
+    // if min_size is zero, may not have buffers (e.g. NullArray)
+    if min_size > 0 && buffer_size < min_size {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+            buffer_size, buffer_index, data_type, min_size
+        )))
+    } else {
+        Ok(())
+    }
+}
+
 /// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`.
 #[inline]
 pub(crate) fn into_buffers(
@@ -313,18 +336,6 @@ impl ArrayData {
         Ok(new_self)
     }
 
-    /// Validates that buffers in this ArrayData are sufficiently
-    /// sized, to store `len` + `offset` total elements of
-    /// `data_type`.
-    ///
-    /// This check is "cheap" in the sense that it does not validate the
-    /// contents of the buffers (e.g. that string offsets for UTF8 arrays
-    /// are within the length of the buffer).
-    pub fn validate(&self) -> Result<()> {
-        // will be filled in a subsequent PR
-        Ok(())
-    }
-
     /// Returns a builder to construct a `ArrayData` instance.
     #[inline]
     pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
@@ -559,6 +570,437 @@ impl ArrayData {
             )
         }
     }
+
+    /// "cheap" validation of an `ArrayData`. Ensures buffers are
+    /// sufficiently sized to store `len` + `offset` total elements of
+    /// `data_type` and performs other inexpensive consistency checks.
+    ///
+    /// This check is "cheap" in the sense that it does not validate the
+    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
+    /// are within the bounds of the values buffer).
+    ///
+    /// TODO: add a validate_full that validates the offsets and valid utf8 data
+    pub fn validate(&self) -> Result<()> {
+        // Need at least this mich space in each buffer
+        let len_plus_offset = self.len + self.offset;
+
+        // Check that the data layout conforms to the spec
+        let layout = layout(&self.data_type);
+
+        // Will validate Union when conforms to new spec:
+        // https://github.com/apache/arrow-rs/issues/85
+        if matches!(&self.data_type, DataType::Union(_)) {
+            return Ok(());
+        }
+        if self.buffers.len() != layout.buffers.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected {} buffers in array of type {:?}, got {}",
+                layout.buffers.len(),
+                self.data_type,
+                self.buffers.len(),
+            )));
+        }
+
+        for (i, (buffer, spec)) in
+            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
+        {
+            match spec {
+                BufferSpec::FixedWidth { byte_width } => {
+                    let min_buffer_size = len_plus_offset
+                        .checked_mul(*byte_width)
+                        .expect("integer overflow computing min buffer size");
+
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::VariableWidth => {
+                    // not cheap to validate (need to look at the
+                    // data). Partially checked in validate_offsets
+                    // called below. Can check with `validate_full`
+                }
+                BufferSpec::BitMap => {
+                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
+                    if buffer.len() < min_buffer_size {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
+                            min_buffer_size, i, self.data_type, buffer.len()
+                        )));
+                    }
+                }
+                BufferSpec::AlwaysNull => {
+                    // Nothing to validate
+                }
+            }
+        }
+
+        if self.null_count > self.len {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count {} for an array exceeds length of {} elements",
+                self.null_count, self.len
+            )));
+        }
+
+        // check null bit buffer size
+        if let Some(null_bit_buffer) = self.null_bitmap.as_ref() {
+            let needed_len = bit_util::ceil(len_plus_offset, 8);
+            if null_bit_buffer.len() < needed_len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "null_bit_buffer size too small. got {} needed {}",
+                    null_bit_buffer.len(),
+                    needed_len
+                )));
+            }
+        } else if self.null_count > 0 {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Array of type {} has {} nulls but no null bitmap",
+                self.data_type, self.null_count
+            )));
+        }
+
+        self.validate_child_data()?;
+
+        // Additional Type specific checks
+        match &self.data_type {
+            DataType::Utf8 | DataType::Binary => {
+                self.validate_offsets::<i32>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::LargeUtf8 | DataType::LargeBinary => {
+                self.validate_offsets::<i64>(&self.buffers[0], self.buffers[1].len())?;
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                // At the moment, constructing a DictionaryArray will also check this
+                if !DataType::is_dictionary_key_type(key_type) {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Dictionary values must be integer, but was {}",
+                        key_type
+                    )));
+                }
+            }
+            _ => {}
+        };
+
+        Ok(())
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
+    /// offsets (of type T> into some other buffer of `values_length` bytes long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // Validate that there are the correct number of offsets for this array's length
+        let required_offsets = self.len + self.offset + 1;
+
+        // An empty list-like array can have 0 offsets
+        if buffer.is_empty() {
+            return Ok(());
+        }
+
+        if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
+                buffer.len(), self.data_type, self.len, required_offsets
+            )));
+        }
+
+        // Justification: buffer size was validated above
+        let offsets = unsafe { &(buffer.typed_data::<T>()[self.offset..]) };
+
+        let first_offset = offsets[0].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[0] ({}) to usize for {}",
+                offsets[0], self.data_type
+            ))
+        })?;
+
+        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Error converting offset[{}] ({}) to usize for {}",
+                self.len, offsets[self.len], self.data_type
+            ))
+        })?;
+
+        if first_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} of {} is larger than values length {}",
+                first_offset, self.data_type, values_length,
+            )));
+        }
+
+        if last_offset > values_length {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Last offset {} of {} is larger than values length {}",
+                last_offset, self.data_type, values_length,
+            )));
+        }
+
+        if first_offset > last_offset {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "First offset {} in {} is smaller than last offset {}",
+                first_offset, self.data_type, last_offset,
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// Validates the layout of `child_data` ArrayData structures
+    fn validate_child_data(&self) -> Result<()> {
+        match &self.data_type {
+            DataType::List(field) | DataType::Map(field, _) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i32>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::LargeList(field) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+                self.validate_offsets::<i64>(&self.buffers[0], values_data.len)?;
+                Ok(())
+            }
+            DataType::FixedSizeList(field, list_size) => {
+                let values_data = self.get_single_valid_child_data(field.data_type())?;
+
+                let list_size: usize = (*list_size).try_into().map_err(|_| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "{} has a negative list_size {}",
+                        self.data_type, list_size
+                    ))
+                })?;
+
+                let expected_values_len = self.len
+                    .checked_mul(list_size)
+                    .expect("integer overflow computing expected number of expected values in FixedListSize");
+
+                if values_data.len < expected_values_len {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
+                        values_data.len, list_size, list_size, self.data_type
+                    )));
+                }
+
+                Ok(())
+            }
+            DataType::Struct(fields) => {
+                self.validate_num_child_data(fields.len())?;
+                for (i, field) in fields.iter().enumerate() {
+                    let field_data = self.get_valid_child_data(i, field.data_type())?;
+
+                    // C++ does this check, but it is not clear why
+                    // field_data checks only len, but self checks len+offset
+                    if field_data.len < (self.len + self.offset) {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
+                            self.data_type, i, field.name(), field_data.len, self.len + self.offset
+                        )));
+                    }
+                }
+                Ok(())
+            }
+            DataType::Union(_fields) => {
+                // Validate Union Array as part of implementing new Union semantics
+                // See comments in `ArrayData::validate()`
+                // https://github.com/apache/arrow-rs/issues/85
+                Ok(())
+            }
+            DataType::Dictionary(_key_type, value_type) => {
+                self.get_single_valid_child_data(value_type)?;
+                Ok(())
+            }
+            _ => {
+                // other types do not have child data
+                if !self.child_data.is_empty() {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Expected no child arrays for type {} but got {}",
+                        self.data_type,
+                        self.child_data.len()
+                    )));
+                }
+                Ok(())
+            }
+        }
+    }
+
+    /// Ensures that this array data has a single child_data with the
+    /// expected type, and calls `validate()` on it. Returns a
+    /// reference to that child_data
+    fn get_single_valid_child_data(
+        &self,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        self.validate_num_child_data(1)?;
+        self.get_valid_child_data(0, expected_type)
+    }
+
+    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
+    fn validate_num_child_data(&self, expected_len: usize) -> Result<()> {
+        if self.child_data().len() != expected_len {
+            Err(ArrowError::InvalidArgumentError(format!(
+                "Value data for {} should contain {} child data array(s), had {}",
+                self.data_type(),
+                expected_len,
+                self.child_data.len()
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Ensures that `child_data[i]` has the expected type, calls
+    /// `validate()` on it, and returns a reference to that child_data
+    fn get_valid_child_data(
+        &self,
+        i: usize,
+        expected_type: &DataType,
+    ) -> Result<&ArrayData> {
+        let values_data = self.child_data
+            .get(i)
+            .ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "{} did not have enough child arrays. Expected at least {} but had only {}",
+                    self.data_type, i+1, self.child_data.len()
+                ))
+            })?;
+
+        if expected_type != &values_data.data_type {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Child type mismatch for {}. Expected {} but child data had {}",
+                self.data_type, expected_type, values_data.data_type
+            )));
+        }
+
+        values_data.validate()?;
+        Ok(values_data)
+    }
+}
+
+/// Return the expected [`DataTypeLayout`] Arrays of this data
+/// type are expected to have
+fn layout(data_type: &DataType) -> DataTypeLayout {
+    // based on C/C++ implementation in
+    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
+    use std::mem::size_of;
+    match data_type {
+        DataType::Null => DataTypeLayout::new_empty(),
+        DataType::Boolean => DataTypeLayout {
+            buffers: vec![BufferSpec::BitMap],
+        },
+        DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()),
+        DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()),
+        DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()),
+        DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()),
+        DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()),
+        DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()),
+        DataType::Float16 => unimplemented!(),
+        DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()),
+        DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()),
+        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            DataTypeLayout::new_fixed_width(size_of::<i64>())
+        }
+        DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
+        DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::FixedSizeBinary(bytes_per_value) => {
+            let bytes_per_value: usize = (*bytes_per_value)
+                .try_into()
+                .expect("negative size for fixed size binary");
+            DataTypeLayout::new_fixed_width(bytes_per_value)
+        }
+        DataType::LargeBinary => DataTypeLayout::new_binary(size_of::<i64>()),
+        DataType::Utf8 => DataTypeLayout::new_binary(size_of::<i32>()),
+        DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::<i64>()),
+        DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data
+        DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()),
+        DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data,
+        DataType::Union(_) => {
+            DataTypeLayout::new_fixed_width(size_of::<u8>())
+            // Note sparse unions only have one buffer (u8) type_ids,
+            // and dense unions have 2 (type_ids as well as offsets).
+            // https://github.com/apache/arrow-rs/issues/85
+        }
+        DataType::Dictionary(key_type, _value_type) => layout(key_type),
+        DataType::Decimal(_, _) => {
+            // Decimals are always some fixed width; The rust implemenation
+            // always uses 16 bytes / size of i128
+            DataTypeLayout::new_fixed_width(size_of::<i128>())
+        }
+        DataType::Map(_, _) => {
+            // same as ListType
+            DataTypeLayout::new_fixed_width(size_of::<i32>())
+        }
+    }
+}
+
+/// Layout specification for a data type
+#[derive(Debug, PartialEq)]
+// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
+struct DataTypeLayout {
+    /// A vector of buffer layout specifications, one for each expected buffer
+    pub buffers: Vec<BufferSpec>,
+}
+
+impl DataTypeLayout {
+    /// Describes a basic numeric array where each element has a fixed width
+    pub fn new_fixed_width(byte_width: usize) -> Self {
+        Self {
+            buffers: vec![BufferSpec::FixedWidth { byte_width }],
+        }
+    }
+
+    /// Describes arrays which have no data of their own
+    /// (e.g. FixedSizeList). Note such arrays may still have a Null
+    /// Bitmap
+    pub fn new_empty() -> Self {
+        Self { buffers: vec![] }
+    }
+
+    /// Describes a basic numeric array where each element has a fixed
+    /// with offset buffer of `offset_byte_width` bytes, followed by a
+    /// variable width data buffer
+    pub fn new_binary(offset_byte_width: usize) -> Self {
+        Self {
+            buffers: vec![
+                // offsets
+                BufferSpec::FixedWidth {
+                    byte_width: offset_byte_width,
+                },
+                // values
+                BufferSpec::VariableWidth,
+            ],
+        }
+    }
+}
+
+/// Layout specification for a single data type buffer
+#[derive(Debug, PartialEq)]
+enum BufferSpec {
+    /// each element has a fixed width
+    FixedWidth { byte_width: usize },
+    /// Variable width, such as string data for utf8 data
+    VariableWidth,
+    /// Buffer holds a bitmap.
+    ///
+    /// Note: Unlike the C++ implementation, the null/validity buffer
+    /// is handled specially rather than as another of the buffers in
+    /// the spec, so this variant is only used for the Boolean type.
+    BitMap,
+    /// Buffer is always null. Unused currently in Rust implementation,
+    /// (used in C++ for Union type)
+    AlwaysNull,
 }
 
 impl PartialEq for ArrayData {
@@ -672,23 +1114,38 @@ impl ArrayDataBuilder {
 mod tests {
     use super::*;
 
+    use crate::array::{Array, Int32Array, StringArray};
     use crate::buffer::Buffer;
+    use crate::datatypes::Field;
     use crate::util::bit_util;
 
     #[test]
-    fn test_new() {
-        let arr_data =
-            ArrayData::try_new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![])
-                .unwrap();
-        assert_eq!(10, arr_data.len());
-        assert_eq!(1, arr_data.null_count());
-        assert_eq!(2, arr_data.offset());
-        assert_eq!(0, arr_data.buffers().len());
-        assert_eq!(0, arr_data.child_data().len());
+    fn test_builder() {
+        // Buffer needs to be at least 25 long
+        let v = (0..25).collect::<Vec<i32>>();
+        let b1 = Buffer::from_slice_ref(&v);
+        let arr_data = ArrayData::builder(DataType::Int32)
+            .len(20)
+            .offset(5)
+            .add_buffer(b1)
+            .null_bit_buffer(Buffer::from(vec![
+                0b01011111, 0b10110101, 0b01100011, 0b00011110,
+            ]))
+            .build()
+            .unwrap();
+
+        assert_eq!(20, arr_data.len());
+        assert_eq!(10, arr_data.null_count());
+        assert_eq!(5, arr_data.offset());
+        assert_eq!(1, arr_data.buffers().len());
+        assert_eq!(
+            Buffer::from_slice_ref(&v).as_slice(),
+            arr_data.buffers()[0].as_slice()
+        );
     }
 
     #[test]
-    fn test_builder() {
+    fn test_builder_with_child_data() {
         let child_arr_data = ArrayData::try_new(
             DataType::Int32,
             5,
@@ -699,24 +1156,17 @@ mod tests {
             vec![],
         )
         .unwrap();
-        let v = vec![0, 1, 2, 3];
-        let b1 = Buffer::from(&v[..]);
-        let arr_data = ArrayData::builder(DataType::Int32)
-            .len(20)
-            .offset(5)
-            .add_buffer(b1)
-            .null_bit_buffer(Buffer::from(vec![
-                0b01011111, 0b10110101, 0b01100011, 0b00011110,
-            ]))
+
+        let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]);
+
+        let arr_data = ArrayData::builder(data_type)
+            .len(5)
+            .offset(0)
             .add_child_data(child_arr_data.clone())
             .build()
             .unwrap();
 
-        assert_eq!(20, arr_data.len());
-        assert_eq!(10, arr_data.null_count());
-        assert_eq!(5, arr_data.offset());
-        assert_eq!(1, arr_data.buffers().len());
-        assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice());
+        assert_eq!(5, arr_data.len());
         assert_eq!(1, arr_data.child_data().len());
         assert_eq!(child_arr_data, arr_data.child_data()[0]);
     }
@@ -729,6 +1179,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -742,6 +1193,7 @@ mod tests {
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(12)
             .offset(2)
+            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -756,6 +1208,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -771,6 +1224,7 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 10);
         let data = ArrayData::builder(DataType::Int32)
             .len(16)
+            .add_buffer(make_i32_buffer(16))
             .null_bit_buffer(Buffer::from(bit_v))
             .build()
             .unwrap();
@@ -788,8 +1242,16 @@ mod tests {
 
     #[test]
     fn test_equality() {
-        let int_data = ArrayData::builder(DataType::Int32).build().unwrap();
-        let float_data = ArrayData::builder(DataType::Float32).build().unwrap();
+        let int_data = ArrayData::builder(DataType::Int32)
+            .len(1)
+            .add_buffer(make_i32_buffer(1))
+            .build()
+            .unwrap();
+        let float_data = ArrayData::builder(DataType::Float32)
+            .len(1)
+            .add_buffer(make_f32_buffer(1))
+            .build()
+            .unwrap();
         assert_ne!(int_data, float_data);
     }
 
@@ -802,4 +1264,380 @@ mod tests {
         let count = count_nulls(null_buffer.as_ref(), 4, 8);
         assert_eq!(count, 3);
     }
+
+    #[test]
+    #[should_panic(
+        expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8"
+    )]
+    fn test_buffer_too_small() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        // should fail as the declared size (10*8 = 80) is larger than the underlying bfufer (8)
+        ArrayData::try_new(DataType::Int64, 10, Some(0), None, 0, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8"
+    )]
+    fn test_buffer_too_small_offset() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        // should fail -- size is ok, but also has offset
+        ArrayData::try_new(DataType::Int64, 1, Some(0), None, 1, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")]
+    fn test_bad_number_of_buffers() {
+        let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]);
+        let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Int64,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![buffer1, buffer2],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "integer overflow computing min buffer size")]
+    fn test_fixed_width_overflow() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Int64,
+            usize::MAX,
+            Some(0),
+            None,
+            0,
+            vec![buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "null_bit_buffer size too small. got 8 needed 13")]
+    fn test_bitmap_too_small() {
+        let buffer = make_i32_buffer(100);
+        let null_bit_buffer = Buffer::from(vec![0b11111111]);
+
+        ArrayData::try_new(
+            DataType::Int32,
+            100,
+            Some(0),
+            Some(null_bit_buffer),
+            0,
+            vec![buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "null_count 3 for an array exceeds length of 2 elements")]
+    fn test_bad_null_count() {
+        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(DataType::Int32, 2, Some(3), None, 0, vec![buffer], vec![])
+            .unwrap();
+    }
+
+    // Test creating a dictionary with a non integer type
+    #[test]
+    #[should_panic(expected = "Dictionary values must be integer, but was Utf8")]
+    fn test_non_int_dictionary() {
+        let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        let data_type =
+            DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32));
+        let child_data = ArrayData::try_new(
+            DataType::Int32,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer.clone()],
+            vec![],
+        )
+        .unwrap();
+        ArrayData::try_new(
+            data_type,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer.clone(), i32_buffer],
+            vec![child_data],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")]
+    fn test_mismatched_dictionary_types() {
+        // test w/ dictionary created with a child array data that has type different than declared
+        let string_array: StringArray =
+            vec![Some("foo"), Some("bar")].into_iter().collect();
+        let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]);
+        // Dict says LargeUtf8 but array is Utf8
+        let data_type = DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::LargeUtf8),
+        );
+        let child_data = string_array.data().clone();
+        ArrayData::try_new(
+            data_type,
+            1,
+            Some(0),
+            None,
+            0,
+            vec![i32_buffer],
+            vec![child_data],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
+    )]
+    fn test_validate_offsets_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offsets buffer size (bytes): 16 isn't large enough for LargeUtf8. Length 2 needs 3"
+    )]
+    fn test_validate_offsets_i64() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]);
+        ArrayData::try_new(
+            DataType::LargeUtf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")]
+    fn test_validate_offsets_negative_first_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")]
+    fn test_validate_offsets_negative_last_i32() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")]
+    fn test_validate_offsets_range_too_small() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // start offset is larger than end
+        let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_range_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_first_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_validate_offsets_first_too_large_skipped() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer, but offset starts at 1 so it is skipped
+        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]);
+        let data = ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            1,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+        let array: StringArray = data.into();
+        let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect();
+        assert_eq!(array, expected);
+    }
+
+    #[test]
+    #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")]
+    fn test_validate_offsets_last_too_large() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        // 10 is off the end of the buffer
+        let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
+    )]
+    fn test_validate_fixed_size_list() {
+        // child has 4 elements,
+        let child_array = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        // but claim we have 3 elements for a fixed size of 2
+        // 10 is off the end of the buffer
+        let field = Field::new("field", DataType::Int32, true);
+        ArrayData::try_new(
+            DataType::FixedSizeList(Box::new(field), 2),
+            3,
+            None,
+            None,
+            0,
+            vec![],
+            vec![child_array.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Child type mismatch for Struct")]
+    fn test_validate_struct_child_type() {
+        let field1 = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        // validate the the type of struct fields matches child fields
+        ArrayData::try_new(
+            DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]),
+            3,
+            None,
+            None,
+            0,
+            vec![],
+            vec![field1.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)"
+    )]
+    fn test_validate_struct_child_length() {
+        // field length only has 4 items, but array claims to have 6
+        let field1 = vec![Some(1), Some(2), Some(3), None]
+            .into_iter()
+            .collect::<Int32Array>();
+
+        ArrayData::try_new(
+            DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]),
+            6,
+            None,
+            None,
+            0,
+            vec![],
+            vec![field1.data().clone()],
+        )
+        .unwrap();
+    }
+
+    /// returns a buffer initialized with some constant value for tests
+    fn make_i32_buffer(n: usize) -> Buffer {
+        Buffer::from_slice_ref(&vec![42i32; n])
+    }
+
+    /// returns a buffer initialized with some constant value for tests
+    fn make_f32_buffer(n: usize) -> Buffer {
+        Buffer::from_slice_ref(&vec![42f32; n])
+    }
 }
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index cd5d4a0..13fcf92 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -2153,6 +2153,7 @@ mod tests {
         assert_eq!(4, values.null_count());
         let u16arr = values.as_any().downcast_ref::<UInt16Array>().unwrap();
 
+        // expect 4 nulls: negative numbers and overflow
         let expected: UInt16Array =
             vec![Some(0), Some(0), Some(0), None, None, None, Some(2), None]
                 .into_iter()
diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs
index f4ddbaf..e4808e2 100644
--- a/arrow/src/compute/util.rs
+++ b/arrow/src/compute/util.rs
@@ -184,7 +184,8 @@ pub(super) mod tests {
         offset: usize,
         null_bit_buffer: Option<Buffer>,
     ) -> Arc<ArrayData> {
-        // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap
+        let buffer = Buffer::from(&vec![11; len]);
+
         Arc::new(
             ArrayData::try_new(
                 DataType::UInt8,
@@ -192,7 +193,7 @@ pub(super) mod tests {
                 None,
                 null_bit_buffer,
                 offset,
-                vec![],
+                vec![buffer],
                 vec![],
             )
             .unwrap(),
diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
index 1cbec34..96fb18b 100644
--- a/arrow/src/datatypes/datatype.rs
+++ b/arrow/src/datatypes/datatype.rs
@@ -477,6 +477,16 @@ impl DataType {
         )
     }
 
+    /// Returns true if this type is valid as a dictionary key
+    /// (e.g. [`super::ArrowDictionaryKeyType`]
+    pub fn is_dictionary_key_type(t: &DataType) -> bool {
+        use DataType::*;
+        matches!(
+            t,
+            UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64
+        )
+    }
+
     /// Compares the datatype with another, ignoring nested field names
     /// and metadata.
     pub(crate) fn equals_datatype(&self, other: &DataType) -> bool {
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs
index e925e2a..82cd101 100644
--- a/arrow/src/ipc/reader.rs
+++ b/arrow/src/ipc/reader.rs
@@ -969,13 +969,28 @@ mod tests {
     }
 
     #[test]
+    #[should_panic(
+        expected = "Last offset 687865856 of Utf8 is larger than values length 41"
+    )]
+    fn read_dictionary_be_not_implemented() {
+        // The offsets are not translated for big-endian files
+        // https://github.com/apache/arrow-rs/issues/859
+        let testdata = crate::util::test_util::arrow_test_data();
+        let file = File::open(format!(
+                "{}/arrow-ipc-stream/integration/1.0.0-bigendian/generated_dictionary.arrow_file",
+                testdata
+            ))
+            .unwrap();
+        FileReader::try_new(file).unwrap();
+    }
+
+    #[test]
     fn read_generated_be_files_should_work() {
         // complementary to the previous test
         let testdata = crate::util::test_util::arrow_test_data();
         let paths = vec![
             "generated_interval",
             "generated_datetime",
-            "generated_dictionary",
             "generated_map",
             "generated_nested",
             "generated_null_trivial",