You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2022/05/15 02:36:08 UTC
[arrow-rs] branch master updated: Speed up the offsets checking (#1684)
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ff182f167 Speed up the offsets checking (#1684)
ff182f167 is described below
commit ff182f167e83b1fc553e213ca208bb82a28dea91
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Sun May 15 10:36:03 2022 +0800
Speed up the offsets checking (#1684)
* simplify offsets fully checking
Signed-off-by: remzi <13...@gmail.com>
* update doc
rename to offsets buffer
Signed-off-by: remzi <13...@gmail.com>
* add more tests for empty binary-like array
Signed-off-by: remzi <13...@gmail.com>
* add docs for empty binary-like array
Signed-off-by: remzi <13...@gmail.com>
* Update arrow/src/array/data.rs
fix a nit in docs.
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
* add benchmark
Signed-off-by: remzi <13...@gmail.com>
* replace windows with scan
Signed-off-by: remzi <13...@gmail.com>
* Update arrow/src/array/data.rs
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
* Update arrow/src/array/data.rs
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
arrow/Cargo.toml | 4 +
arrow/benches/array_data_validate.rs | 48 +++++++++
arrow/src/array/data.rs | 196 ++++++++++++++++++++++++-----------
3 files changed, 187 insertions(+), 61 deletions(-)
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index cc6d5a4f7..5658b7ae5 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -176,3 +176,7 @@ harness = false
[[bench]]
name = "string_kernels"
harness = false
+
+[[bench]]
+name = "array_data_validate"
+harness = false
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
new file mode 100644
index 000000000..32e548a29
--- /dev/null
+++ b/arrow/benches/array_data_validate.rs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::{array::*, buffer::Buffer, datatypes::DataType};
+
+fn create_binary_array_data(length: i32) -> ArrayData {
+ let value_buffer = Buffer::from_iter(0_i32..length);
+ let offsets_buffer = Buffer::from_iter(0_i32..length + 1);
+ ArrayData::try_new(
+ DataType::Binary,
+ length as usize,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, value_buffer],
+ vec![],
+ )
+ .unwrap()
+}
+
+fn array_slice_benchmark(c: &mut Criterion) {
+ c.bench_function("validate_binary_array_data 20000", |b| {
+ b.iter(|| create_binary_array_data(20000))
+ });
+}
+
+criterion_group!(benches, array_slice_benchmark);
+criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index c0ecef75d..f7eacb0a4 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -726,19 +726,21 @@ impl ArrayData {
/// Returns a reference to the data in `buffer` as a typed slice
/// (typically `&[i32]` or `&[i64]`) after validating. The
/// returned slice is guaranteed to have at least `self.len + 1`
- /// entries
+ /// entries.
+ ///
+ /// For an empty array, the `buffer` can also be empty.
fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>(
&'a self,
buffer: &'a Buffer,
) -> Result<&'a [T]> {
- // Validate that there are the correct number of offsets for this array's length
- let required_offsets = self.len + self.offset + 1;
-
// An empty list-like array can have 0 offsets
- if buffer.is_empty() {
+ if buffer.is_empty() && self.len == 0 {
return Ok(&[]);
}
+ // Validate that there are the correct number of offsets for this array's length
+ let required_offsets = self.len + self.offset + 1;
+
if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
return Err(ArrowError::InvalidArgumentError(format!(
"Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}",
@@ -1033,15 +1035,18 @@ impl ArrayData {
}
/// Calls the `validate(item_index, range)` function for each of
- /// the ranges specified in the arrow offset buffer of type
+ /// the ranges specified in the arrow offsets buffer of type
/// `T`. Also validates that each offset is smaller than
- /// `max_offset`
+ /// `offset_limit`
+ ///
+ /// For an empty array, the offsets buffer can either be empty
+ /// or contain a single `0`.
///
- /// For example, the offset buffer contained `[1, 2, 4]`, this
+ /// For example, the offsets buffer contained `[1, 2, 4]`, this
/// function would call `validate([1,2])`, and `validate([2,4])`
fn validate_each_offset<T, V>(
&self,
- offset_buffer: &Buffer,
+ offsets_buffer: &Buffer,
offset_limit: usize,
validate: V,
) -> Result<()>
@@ -1049,60 +1054,45 @@ impl ArrayData {
T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
V: Fn(usize, Range<usize>) -> Result<()>,
{
- // An empty binary-like array can have 0 offsets
- if self.len == 0 && offset_buffer.is_empty() {
- return Ok(());
- }
-
- let offsets = self.typed_offsets::<T>(offset_buffer)?;
-
- offsets
+ self.typed_offsets::<T>(offsets_buffer)?
.iter()
- .zip(offsets.iter().skip(1))
.enumerate()
- .map(|(i, (&start_offset, &end_offset))| {
- let start_offset: usize = start_offset
- .try_into()
- .map_err(|_| {
- ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: could not convert start_offset {} to usize in slot {}",
- start_offset, i))
- })?;
- let end_offset: usize = end_offset
- .try_into()
- .map_err(|_| {
- ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: Could not convert end_offset {} to usize in slot {}",
- end_offset, i+1))
- })?;
-
- if start_offset > offset_limit {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
- i, start_offset, offset_limit))
- );
- }
-
- if end_offset > offset_limit {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
- i, end_offset, offset_limit))
+ .map(|(i, x)| {
+ // check if the offset can be converted to usize
+ let r = x.to_usize().ok_or_else(|| {
+ ArrowError::InvalidArgumentError(format!(
+ "Offset invariant failure: Could not convert offset {} to usize at position {}",
+ x, i))}
);
+ // check if the offset exceeds the limit
+ match r {
+ Ok(n) if n <= offset_limit => Ok((i, n)),
+ Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
+ "Offset invariant failure: offset at position {} out of bounds: {} > {}",
+ i, x, offset_limit))
+ ),
+ Err(e) => Err(e),
}
-
- // check range actually is low -> high
- if start_offset > end_offset {
- return Err(ArrowError::InvalidArgumentError(format!(
+ })
+ .scan(0_usize, |start, end| {
+ // check offsets are monotonically increasing
+ match end {
+ Ok((i, end)) if *start <= end => {
+ let range = Some(Ok((i, *start..end)));
+ *start = end;
+ range
+ }
+ Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
"Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
- i, start_offset, end_offset))
- );
+ i - 1, start, end))
+ )),
+ Err(err) => Some(Err(err)),
}
-
- Ok((i, start_offset..end_offset))
})
+ .skip(1) // the first element is meaningless
.try_for_each(|res: Result<(usize, Range<usize>)>| {
let (item_index, range) = res?;
- validate(item_index, range)
+ validate(item_index-1, range)
})
}
@@ -1821,6 +1811,90 @@ mod tests {
.unwrap();
}
+ #[test]
+ fn test_empty_utf8_array_with_empty_offsets_buffer() {
+ let data_buffer = Buffer::from(&[]);
+ let offsets_buffer = Buffer::from(&[]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 0,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ fn test_empty_utf8_array_with_single_zero_offset() {
+ let data_buffer = Buffer::from(&[]);
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 0,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")]
+ fn test_empty_utf8_array_with_invalid_offset() {
+ let data_buffer = Buffer::from(&[]);
+ let offsets_buffer = Buffer::from_slice_ref(&[1i32]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 0,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ fn test_empty_utf8_array_with_non_zero_offset() {
+ let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]);
+ ArrayData::try_new(
+ DataType::Utf8,
+ 0,
+ None,
+ None,
+ 3,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Offsets buffer size (bytes): 4 isn't large enough for LargeUtf8. Length 0 needs 1"
+ )]
+ fn test_empty_large_utf8_array_with_wrong_type_offsets() {
+ let data_buffer = Buffer::from(&[]);
+ let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
+ ArrayData::try_new(
+ DataType::LargeUtf8,
+ 0,
+ None,
+ None,
+ 0,
+ vec![offsets_buffer, data_buffer],
+ vec![],
+ )
+ .unwrap();
+ }
+
#[test]
#[should_panic(
expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3"
@@ -2110,7 +2184,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_utf8_out_of_bounds() {
check_index_out_of_bounds_validation::<i32>(DataType::Utf8);
@@ -2118,7 +2192,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_large_utf8_out_of_bounds() {
check_index_out_of_bounds_validation::<i64>(DataType::LargeUtf8);
@@ -2126,7 +2200,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_binary_out_of_bounds() {
check_index_out_of_bounds_validation::<i32>(DataType::Binary);
@@ -2134,7 +2208,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
)]
fn test_validate_large_binary_out_of_bounds() {
check_index_out_of_bounds_validation::<i64>(DataType::LargeBinary);
@@ -2327,7 +2401,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
)]
fn test_validate_list_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
@@ -2336,7 +2410,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+ expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4"
)]
fn test_validate_large_list_offsets() {
let field_type = Field::new("f", DataType::Int32, true);
@@ -2346,7 +2420,7 @@ mod tests {
/// Test that the list of type `data_type` generates correct errors for negative offsets
#[test]
#[should_panic(
- expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2"
+ expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2"
)]
fn test_validate_list_negative_offsets() {
let values: Int32Array =