You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2022/05/15 02:36:08 UTC

[arrow-rs] branch master updated: Speed up the offsets checking (#1684)

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new ff182f167 Speed up the offsets checking (#1684)
ff182f167 is described below

commit ff182f167e83b1fc553e213ca208bb82a28dea91
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Sun May 15 10:36:03 2022 +0800

    Speed up the offsets checking (#1684)
    
    * simplify offsets fully checking
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * update doc
    rename to offsets buffer
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add more tests for empty binary-like array
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add docs for empty binary-like array
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * Update arrow/src/array/data.rs
    
    fix a nit in docs.
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    * add benchmark
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * replace windows with scan
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * Update arrow/src/array/data.rs
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    * Update arrow/src/array/data.rs
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 arrow/Cargo.toml                     |   4 +
 arrow/benches/array_data_validate.rs |  48 +++++++++
 arrow/src/array/data.rs              | 196 ++++++++++++++++++++++++-----------
 3 files changed, 187 insertions(+), 61 deletions(-)

diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index cc6d5a4f7..5658b7ae5 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -176,3 +176,7 @@ harness = false
 [[bench]]
 name = "string_kernels"
 harness = false
+
+[[bench]]
+name = "array_data_validate"
+harness = false
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
new file mode 100644
index 000000000..32e548a29
--- /dev/null
+++ b/arrow/benches/array_data_validate.rs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::{array::*, buffer::Buffer, datatypes::DataType};
+
+fn create_binary_array_data(length: i32) -> ArrayData {
+    let value_buffer = Buffer::from_iter(0_i32..length);
+    let offsets_buffer = Buffer::from_iter(0_i32..length + 1);
+    ArrayData::try_new(
+        DataType::Binary,
+        length as usize,
+        None,
+        None,
+        0,
+        vec![offsets_buffer, value_buffer],
+        vec![],
+    )
+    .unwrap()
+}
+
+fn array_slice_benchmark(c: &mut Criterion) {
+    c.bench_function("validate_binary_array_data 20000", |b| {
+        b.iter(|| create_binary_array_data(20000))
+    });
+}
+
+criterion_group!(benches, array_slice_benchmark);
+criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index c0ecef75d..f7eacb0a4 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -726,19 +726,21 @@ impl ArrayData {
     /// Returns a reference to the data in `buffer` as a typed slice
     /// (typically `&[i32]` or `&[i64]`) after validating. The
     /// returned slice is guaranteed to have at least `self.len + 1`
-    /// entries
+    /// entries.
+    ///
+    /// For an empty array, the `buffer` can also be empty.
     fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>(
         &'a self,
         buffer: &'a Buffer,
     ) -> Result<&'a [T]> {
-        // Validate that there are the correct number of offsets for this array's length
-        let required_offsets = self.len + self.offset + 1;
-
         // An empty list-like array can have 0 offsets
-        if buffer.is_empty() {
+        if buffer.is_empty() && self.len == 0 {
             return Ok(&[]);
         }
 
+        // Validate that there are the correct number of offsets for this array's length
+        let required_offsets = self.len + self.offset + 1;
+
         if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
@@ -1033,15 +1035,18 @@ impl ArrayData {
     }
 
     /// Calls the `validate(item_index, range)` function for each of
-    /// the ranges specified in the arrow offset buffer of type
+    /// the ranges specified in the arrow offsets buffer of type
     /// `T`. Also validates that each offset is smaller than
-    /// `max_offset`
+    /// `offset_limit`
+    ///
+    /// For an empty array, the offsets buffer can either be empty
+    /// or contain a single `0`.
     ///
-    /// For example, the offset buffer contained `[1, 2, 4]`, this
+    /// For example, the offsets buffer contained `[1, 2, 4]`, this
     /// function would call `validate([1,2])`, and `validate([2,4])`
     fn validate_each_offset<T, V>(
         &self,
-        offset_buffer: &Buffer,
+        offsets_buffer: &Buffer,
         offset_limit: usize,
         validate: V,
     ) -> Result<()>
@@ -1049,60 +1054,45 @@ impl ArrayData {
         T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
         V: Fn(usize, Range<usize>) -> Result<()>,
     {
-        // An empty binary-like array can have 0 offsets
-        if self.len == 0 && offset_buffer.is_empty() {
-            return Ok(());
-        }
-
-        let offsets = self.typed_offsets::<T>(offset_buffer)?;
-
-        offsets
+        self.typed_offsets::<T>(offsets_buffer)?
             .iter()
-            .zip(offsets.iter().skip(1))
             .enumerate()
-            .map(|(i, (&start_offset, &end_offset))| {
-                let start_offset: usize = start_offset
-                    .try_into()
-                    .map_err(|_| {
-                        ArrowError::InvalidArgumentError(format!(
-                            "Offset invariant failure: could not convert start_offset {} to usize in slot {}",
-                            start_offset, i))
-                    })?;
-                let end_offset: usize = end_offset
-                    .try_into()
-                    .map_err(|_| {
-                        ArrowError::InvalidArgumentError(format!(
-                            "Offset invariant failure: Could not convert end_offset {} to usize in slot {}",
-                            end_offset, i+1))
-                    })?;
-
-                if start_offset > offset_limit {
-                    return Err(ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
-                        i, start_offset, offset_limit))
-                    );
-                }
-
-                if end_offset > offset_limit {
-                    return Err(ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
-                        i, end_offset, offset_limit))
+            .map(|(i, x)| {
+                // check if the offset can be converted to usize
+                let r = x.to_usize().ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: Could not convert offset {} to usize at position {}",
+                        x, i))}
                     );
+                // check if the offset exceeds the limit
+                match r {
+                    Ok(n) if n <= offset_limit => Ok((i, n)),
+                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: offset at position {} out of bounds: {} > {}",
+                        i, x, offset_limit))
+                    ),
+                    Err(e) => Err(e),
                 }
-
-                // check range actually is low -> high
-                if start_offset > end_offset {
-                    return Err(ArrowError::InvalidArgumentError(format!(
+            })
+            .scan(0_usize, |start, end| {
+                // check offsets are monotonically increasing
+                match end {
+                    Ok((i, end)) if *start <= end => {
+                        let range = Some(Ok((i, *start..end)));
+                        *start = end;
+                        range
+                    }
+                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                         "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
-                        i, start_offset, end_offset))
-                    );
+                        i - 1, start, end))
+                    )),
+                    Err(err) => Some(Err(err)),
                 }
-
-                Ok((i, start_offset..end_offset))
             })
+            .skip(1) // the first element is meaningless
             .try_for_each(|res: Result<(usize, Range<usize>)>| {
                 let (item_index, range) = res?;
-                validate(item_index, range)
+                validate(item_index-1, range)
             })
     }
 
@@ -1821,6 +1811,90 @@ mod tests {
         .unwrap();
     }
 
+    #[test]
+    fn test_empty_utf8_array_with_empty_offsets_buffer() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from(&[]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_empty_utf8_array_with_single_zero_offset() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")]
+    fn test_empty_utf8_array_with_invalid_offset() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from_slice_ref(&[1i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_empty_utf8_array_with_non_zero_offset() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            3,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offsets buffer size (bytes): 4 isn't large enough for LargeUtf8. Length 0 needs 1"
+    )]
+    fn test_empty_large_utf8_array_with_wrong_type_offsets() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
+        ArrayData::try_new(
+            DataType::LargeUtf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
     #[test]
     #[should_panic(
         expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
@@ -2110,7 +2184,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
     )]
     fn test_validate_utf8_out_of_bounds() {
         check_index_out_of_bounds_validation::<i32>(DataType::Utf8);
@@ -2118,7 +2192,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
     )]
     fn test_validate_large_utf8_out_of_bounds() {
         check_index_out_of_bounds_validation::<i64>(DataType::LargeUtf8);
@@ -2126,7 +2200,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
     )]
     fn test_validate_binary_out_of_bounds() {
         check_index_out_of_bounds_validation::<i32>(DataType::Binary);
@@ -2134,7 +2208,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
     )]
     fn test_validate_large_binary_out_of_bounds() {
         check_index_out_of_bounds_validation::<i64>(DataType::LargeBinary);
@@ -2327,7 +2401,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
     )]
     fn test_validate_list_offsets() {
         let field_type = Field::new("f", DataType::Int32, true);
@@ -2336,7 +2410,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+        expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
     )]
     fn test_validate_large_list_offsets() {
         let field_type = Field::new("f", DataType::Int32, true);
@@ -2346,7 +2420,7 @@ mod tests {
     /// Test that the list of type `data_type` generates correct errors for negative offsets
     #[test]
     #[should_panic(
-        expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2"
+        expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2"
     )]
     fn test_validate_list_negative_offsets() {
         let values: Int32Array =