You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/14 17:30:21 UTC

[arrow-rs] branch master updated: Add CI check for full validation mode (#1546)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 3fed612b3 Add CI check for full validation mode (#1546)
3fed612b3 is described below

commit 3fed612b3bf57cb054a65eaba79299b89dcf733e
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Apr 14 13:30:16 2022 -0400

    Add CI check for full validation mode (#1546)
    
    * Add force_validate feature
    
    * Disable some redundant checks
    
    * Add issue link
    
    * Add test with force_validate feature flag
    
    * fix up message
    
    * disable due to https://github.com/apache/arrow-rs/issues/1547
    
    * disable ipc test failure
    
    * fix clippy
    
    * Fix doctest to pass with force_validate enabled
---
 .github/workflows/rust.yml             |  4 +++-
 arrow/Cargo.toml                       |  4 ++++
 arrow/src/array/array_binary.rs        |  3 +++
 arrow/src/array/array_boolean.rs       |  3 +++
 arrow/src/array/array_list.rs          | 12 ++++++++++++
 arrow/src/array/array_primitive.rs     |  3 +++
 arrow/src/array/data.rs                | 12 +++++++++---
 arrow/src/compute/kernels/filter.rs    |  6 ++++++
 arrow/src/compute/kernels/substring.rs | 10 +++++++++-
 arrow/src/ipc/reader.rs                |  2 ++
 10 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e28ebe6e1..d6032e0f1 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -111,7 +111,9 @@ jobs:
 
           # Switch to arrow crate
           cd arrow
-          # re-run tests on arrow crate with additional features
+          # re-run tests on arrow crate to ensure
+          # all arrays are created correctly
+          cargo test --features=force_validate
           cargo test --features=prettyprint
           # run test on arrow crate with minimal set of features
           cargo test --no-default-features
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 343d9f2f8..016ca54f8 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -71,6 +71,10 @@ prettyprint = ["comfy-table"]
 # target without assuming an environment containing JavaScript.
 test_utils = ["rand"]
 pyarrow = ["pyo3"]
+# force_validate runs full data validation for all arrays that are created
+# this is not enabled by default as it is too computationally expensive
+# but is run as part of our CI checks
+force_validate = []
 
 [dev-dependencies]
 rand = "0.8"
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index ee64818f7..b78f47126 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -1426,6 +1426,9 @@ mod tests {
     #[should_panic(
         expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
     )]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_fixed_size_binary_array_from_incorrect_list_array() {
         let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
         let values_data = ArrayData::builder(DataType::UInt32)
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index 12cecd411..f4e9ce28b 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -350,6 +350,9 @@ mod tests {
     #[test]
     #[should_panic(expected = "BooleanArray data should contain a single buffer only \
                                (values buffer)")]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_boolean_array_invalid_buffer_len() {
         let data = unsafe {
             ArrayData::builder(DataType::Boolean)
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index e91868358..c1d948b92 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -777,6 +777,9 @@ mod tests {
     #[should_panic(
         expected = "FixedSizeListArray child array length should be a multiple of 3"
     )]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_fixed_size_list_array_unequal_children() {
         // Construct a value array
         let value_data = ArrayData::builder(DataType::Int32)
@@ -1065,6 +1068,9 @@ mod tests {
     #[should_panic(
         expected = "ListArray data should contain a single buffer only (value offsets)"
     )]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_list_array_invalid_buffer_len() {
         let value_data = unsafe {
             ArrayData::builder(DataType::Int32)
@@ -1087,6 +1093,9 @@ mod tests {
     #[should_panic(
         expected = "ListArray should contain a single child array (values array)"
     )]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_list_array_invalid_child_array_len() {
         let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]);
         let list_data_type =
@@ -1137,6 +1146,9 @@ mod tests {
 
     #[test]
     #[should_panic(expected = "memory is not aligned")]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_list_array_alignment() {
         let ptr = alloc::allocate_aligned::<u8>(8);
         let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index 23e1c1f1a..e4146e768 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -1028,6 +1028,9 @@ mod tests {
     #[test]
     #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
                                (values buffer)")]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
     fn test_primitive_array_invalid_buffer_len() {
         let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]);
         let data = unsafe {
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 8622c0a2c..f2dadcf78 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -272,6 +272,7 @@ impl ArrayData {
     /// Note: This is a low level API and most users of the arrow
     /// crate should create arrays using the methods in the `array`
     /// module.
+    #[allow(clippy::let_and_return)]
     pub unsafe fn new_unchecked(
         data_type: DataType,
         len: usize,
@@ -286,7 +287,7 @@ impl ArrayData {
             Some(null_count) => null_count,
         };
         let null_bitmap = null_bit_buffer.map(Bitmap::from);
-        Self {
+        let new_self = Self {
             data_type,
             len,
             null_count,
@@ -294,7 +295,12 @@ impl ArrayData {
             buffers,
             child_data,
             null_bitmap,
-        }
+        };
+
+        // Provide a force_validate mode
+        #[cfg(feature = "force_validate")]
+        new_self.validate_full().unwrap();
+        new_self
     }
 
     /// Create a new ArrayData, validating that the provided buffers
@@ -2340,7 +2346,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "child #0 invalid: Invalid argument error: Value at position 1 out of bounds: -1 (should be in [0, 1])"
+        expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])"
     )]
     /// test that children are validated recursively (aka bugs in child data of struct also are flagged)
     fn test_validate_recursive() {
diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs
index df59ba63c..4ab5b180b 100644
--- a/arrow/src/compute/kernels/filter.rs
+++ b/arrow/src/compute/kernels/filter.rs
@@ -1692,6 +1692,9 @@ mod tests {
     }
 
     #[test]
+    // Fails when validation enabled
+    // https://github.com/apache/arrow-rs/issues/1547
+    #[cfg(not(feature = "force_validate"))]
     fn test_filter_union_array_sparse() {
         let mut builder = UnionBuilder::new_sparse(3);
         builder.append::<Int32Type>("A", 1).unwrap();
@@ -1703,6 +1706,9 @@ mod tests {
     }
 
     #[test]
+    // Fails when validation enabled
+    // https://github.com/apache/arrow-rs/issues/1547
+    #[cfg(not(feature = "force_validate"))]
     fn test_filter_union_array_sparse_with_nulls() {
         let mut builder = UnionBuilder::new_sparse(4);
         builder.append::<Int32Type>("A", 1).unwrap();
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 6e75f74ef..c452a8a73 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -109,15 +109,23 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
 ///
 /// # Warning
 ///
-/// This function **might** return in invalid utf-8 format if the character length falls on a non-utf8 boundary.
+/// This function **might** return in invalid utf-8 format if the
+/// character length falls on a non-utf8 boundary, which we
+/// [hope to fix](https://github.com/apache/arrow-rs/issues/1531)
+/// in a future release.
+///
 /// ## Example of getting an invalid substring
 /// ```
+/// # // Doesn't pass due to  https://github.com/apache/arrow-rs/issues/1531
+/// # #[cfg(not(feature = "force_validate"))]
+/// # {
 /// # use arrow::array::StringArray;
 /// # use arrow::compute::kernels::substring::substring;
 /// let array = StringArray::from(vec![Some("E=mc²")]);
 /// let result = substring(&array, -1, &None).unwrap();
 /// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
 /// assert_eq!(result.value(0).as_bytes(), &[0x00B2]); // invalid utf-8 format
+/// # }
 /// ```
 ///
 /// # Error
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs
index 19579968d..1134ed425 100644
--- a/arrow/src/ipc/reader.rs
+++ b/arrow/src/ipc/reader.rs
@@ -1107,6 +1107,8 @@ mod tests {
     }
 
     #[test]
+    // https://github.com/apache/arrow-rs/issues/1548
+    #[cfg(not(feature = "force_validate"))]
     fn projection_should_work() {
         // complementary to the previous test
         let testdata = crate::util::test_util::arrow_test_data();