You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/14 17:30:21 UTC
[arrow-rs] branch master updated: Add CI check for full validation mode (#1546)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3fed612b3 Add CI check for full validation mode (#1546)
3fed612b3 is described below
commit 3fed612b3bf57cb054a65eaba79299b89dcf733e
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Apr 14 13:30:16 2022 -0400
Add CI check for full validation mode (#1546)
* Add force_validate feature
* Disable some redundant checks
* Add issue link
* Add test with force_validate feature flag
* fix up message
* disable due to https://github.com/apache/arrow-rs/issues/1547
* disable ipc test failure
* fix clippy
* Fix doctest to pass with force_validate enabled
---
.github/workflows/rust.yml | 4 +++-
arrow/Cargo.toml | 4 ++++
arrow/src/array/array_binary.rs | 3 +++
arrow/src/array/array_boolean.rs | 3 +++
arrow/src/array/array_list.rs | 12 ++++++++++++
arrow/src/array/array_primitive.rs | 3 +++
arrow/src/array/data.rs | 12 +++++++++---
arrow/src/compute/kernels/filter.rs | 6 ++++++
arrow/src/compute/kernels/substring.rs | 10 +++++++++-
arrow/src/ipc/reader.rs | 2 ++
10 files changed, 54 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index e28ebe6e1..d6032e0f1 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -111,7 +111,9 @@ jobs:
# Switch to arrow crate
cd arrow
- # re-run tests on arrow crate with additional features
+ # re-run tests on arrow crate to ensure
+ # all arrays are created correctly
+ cargo test --features=force_validate
cargo test --features=prettyprint
# run test on arrow crate with minimal set of features
cargo test --no-default-features
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 343d9f2f8..016ca54f8 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -71,6 +71,10 @@ prettyprint = ["comfy-table"]
# target without assuming an environment containing JavaScript.
test_utils = ["rand"]
pyarrow = ["pyo3"]
+# force_validate runs full data validation for all arrays that are created
+# this is not enabled by default as it is too computationally expensive
+# but is run as part of our CI checks
+force_validate = []
[dev-dependencies]
rand = "0.8"
diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index ee64818f7..b78f47126 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -1426,6 +1426,9 @@ mod tests {
#[should_panic(
expected = "FixedSizeBinaryArray can only be created from FixedSizeList<u8> arrays"
)]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_fixed_size_binary_array_from_incorrect_list_array() {
let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
let values_data = ArrayData::builder(DataType::UInt32)
diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs
index 12cecd411..f4e9ce28b 100644
--- a/arrow/src/array/array_boolean.rs
+++ b/arrow/src/array/array_boolean.rs
@@ -350,6 +350,9 @@ mod tests {
#[test]
#[should_panic(expected = "BooleanArray data should contain a single buffer only \
(values buffer)")]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_boolean_array_invalid_buffer_len() {
let data = unsafe {
ArrayData::builder(DataType::Boolean)
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index e91868358..c1d948b92 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -777,6 +777,9 @@ mod tests {
#[should_panic(
expected = "FixedSizeListArray child array length should be a multiple of 3"
)]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_fixed_size_list_array_unequal_children() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
@@ -1065,6 +1068,9 @@ mod tests {
#[should_panic(
expected = "ListArray data should contain a single buffer only (value offsets)"
)]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_list_array_invalid_buffer_len() {
let value_data = unsafe {
ArrayData::builder(DataType::Int32)
@@ -1087,6 +1093,9 @@ mod tests {
#[should_panic(
expected = "ListArray should contain a single child array (values array)"
)]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_list_array_invalid_child_array_len() {
let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]);
let list_data_type =
@@ -1137,6 +1146,9 @@ mod tests {
#[test]
#[should_panic(expected = "memory is not aligned")]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_list_array_alignment() {
let ptr = alloc::allocate_aligned::<u8>(8);
let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) };
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index 23e1c1f1a..e4146e768 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -1028,6 +1028,9 @@ mod tests {
#[test]
#[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
(values buffer)")]
+ // Different error messages, so skip for now
+ // https://github.com/apache/arrow-rs/issues/1545
+ #[cfg(not(feature = "force_validate"))]
fn test_primitive_array_invalid_buffer_len() {
let buffer = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4]);
let data = unsafe {
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 8622c0a2c..f2dadcf78 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -272,6 +272,7 @@ impl ArrayData {
/// Note: This is a low level API and most users of the arrow
/// crate should create arrays using the methods in the `array`
/// module.
+ #[allow(clippy::let_and_return)]
pub unsafe fn new_unchecked(
data_type: DataType,
len: usize,
@@ -286,7 +287,7 @@ impl ArrayData {
Some(null_count) => null_count,
};
let null_bitmap = null_bit_buffer.map(Bitmap::from);
- Self {
+ let new_self = Self {
data_type,
len,
null_count,
@@ -294,7 +295,12 @@ impl ArrayData {
buffers,
child_data,
null_bitmap,
- }
+ };
+
+ // Provide a force_validate mode
+ #[cfg(feature = "force_validate")]
+ new_self.validate_full().unwrap();
+ new_self
}
/// Create a new ArrayData, validating that the provided buffers
@@ -2340,7 +2346,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "child #0 invalid: Invalid argument error: Value at position 1 out of bounds: -1 (should be in [0, 1])"
+ expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])"
)]
/// test that children are validated recursively (aka bugs in child data of struct also are flagged)
fn test_validate_recursive() {
diff --git a/arrow/src/compute/kernels/filter.rs b/arrow/src/compute/kernels/filter.rs
index df59ba63c..4ab5b180b 100644
--- a/arrow/src/compute/kernels/filter.rs
+++ b/arrow/src/compute/kernels/filter.rs
@@ -1692,6 +1692,9 @@ mod tests {
}
#[test]
+ // Fails when validation enabled
+ // https://github.com/apache/arrow-rs/issues/1547
+ #[cfg(not(feature = "force_validate"))]
fn test_filter_union_array_sparse() {
let mut builder = UnionBuilder::new_sparse(3);
builder.append::<Int32Type>("A", 1).unwrap();
@@ -1703,6 +1706,9 @@ mod tests {
}
#[test]
+ // Fails when validation enabled
+ // https://github.com/apache/arrow-rs/issues/1547
+ #[cfg(not(feature = "force_validate"))]
fn test_filter_union_array_sparse_with_nulls() {
let mut builder = UnionBuilder::new_sparse(4);
builder.append::<Int32Type>("A", 1).unwrap();
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index 6e75f74ef..c452a8a73 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -109,15 +109,23 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
///
/// # Warning
///
-/// This function **might** return in invalid utf-8 format if the character length falls on a non-utf8 boundary.
+/// This function **might** return in invalid utf-8 format if the
+/// character length falls on a non-utf8 boundary, which we
+/// [hope to fix](https://github.com/apache/arrow-rs/issues/1531)
+/// in a future release.
+///
/// ## Example of getting an invalid substring
/// ```
+/// # // Doesn't pass due to https://github.com/apache/arrow-rs/issues/1531
+/// # #[cfg(not(feature = "force_validate"))]
+/// # {
/// # use arrow::array::StringArray;
/// # use arrow::compute::kernels::substring::substring;
/// let array = StringArray::from(vec![Some("E=mc²")]);
/// let result = substring(&array, -1, &None).unwrap();
/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
/// assert_eq!(result.value(0).as_bytes(), &[0x00B2]); // invalid utf-8 format
+/// # }
/// ```
///
/// # Error
diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs
index 19579968d..1134ed425 100644
--- a/arrow/src/ipc/reader.rs
+++ b/arrow/src/ipc/reader.rs
@@ -1107,6 +1107,8 @@ mod tests {
}
#[test]
+ // https://github.com/apache/arrow-rs/issues/1548
+ #[cfg(not(feature = "force_validate"))]
fn projection_should_work() {
// complementary to the previous test
let testdata = crate::util::test_util::arrow_test_data();