You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/26 15:20:59 UTC
[arrow-rs] branch master updated: Improve `validate_utf8` performance (#2048)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0c640544f Improve `validate_utf8` performance (#2048)
0c640544f is described below
commit 0c640544fc484721812267978dd90346264ea000
Author: Trent Feda <36...@users.noreply.github.com>
AuthorDate: Tue Jul 26 11:20:55 2022 -0400
Improve `validate_utf8` performance (#2048)
* Add UTF-8 validation benchmark
* Improve UTF-8 validation performance
* Fix clippy errors in the benchmark
* Add `is_char_boundary()` check to UTF-8 validation
---
arrow/benches/array_data_validate.rs | 15 +++++++++++--
arrow/src/array/data.rs | 41 +++++++++++++++++++++++++++---------
2 files changed, 44 insertions(+), 12 deletions(-)
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
index c46252bec..3cd13c09c 100644
--- a/arrow/benches/array_data_validate.rs
+++ b/arrow/benches/array_data_validate.rs
@@ -37,11 +37,22 @@ fn create_binary_array_data(length: i32) -> ArrayData {
.unwrap()
}
-fn array_slice_benchmark(c: &mut Criterion) {
+fn validate_utf8_array(arr: &StringArray) {
+ arr.data().validate_values().unwrap();
+}
+
+fn validate_benchmark(c: &mut Criterion) {
+ //Binary Array
c.bench_function("validate_binary_array_data 20000", |b| {
b.iter(|| create_binary_array_data(20000))
});
+
+ //Utf8 Array
+ let str_arr = StringArray::from(vec!["test"; 20000]);
+ c.bench_function("validate_utf8_array_data 20000", |b| {
+ b.iter(|| validate_utf8_array(&str_arr))
+ });
}
-criterion_group!(benches, array_slice_benchmark);
+criterion_group!(benches, validate_benchmark);
criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 4ae7f069e..c38107b25 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -1141,16 +1141,37 @@ impl ArrayData {
T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
{
let values_buffer = &self.buffers[1].as_slice();
-
- self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
- std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
- ArrowError::InvalidArgumentError(format!(
- "Invalid UTF8 sequence at string index {} ({:?}): {}",
- string_index, range, e
- ))
- })?;
- Ok(())
- })
+ if let Ok(values_str) = std::str::from_utf8(values_buffer) {
+ // Validate Offsets are correct
+ self.validate_each_offset::<T, _>(
+ values_buffer.len(),
+ |string_index, range| {
+ if !values_str.is_char_boundary(range.start)
+ || !values_str.is_char_boundary(range.end)
+ {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "incomplete utf-8 byte sequence from index {}",
+ string_index
+ )));
+ }
+ Ok(())
+ },
+ )
+ } else {
+ // find specific offset that failed utf8 validation
+ self.validate_each_offset::<T, _>(
+ values_buffer.len(),
+ |string_index, range| {
+ std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
+ ArrowError::InvalidArgumentError(format!(
+ "Invalid UTF8 sequence at string index {} ({:?}): {}",
+ string_index, range, e
+ ))
+ })?;
+ Ok(())
+ },
+ )
+ }
}
/// Ensures that all offsets in `buffers[0]` into `buffers[1]` are