You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/26 15:20:59 UTC

[arrow-rs] branch master updated: Improve `validate_utf8` performance (#2048)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c640544f Improve `validate_utf8` performance (#2048)
0c640544f is described below

commit 0c640544fc484721812267978dd90346264ea000
Author: Trent Feda <36...@users.noreply.github.com>
AuthorDate: Tue Jul 26 11:20:55 2022 -0400

    Improve `validate_utf8` performance (#2048)
    
    * added utf8 validation bench
    
    * improve utf8 validation performance
    
    * fix bench clippy errors
    
    * Add is_char_boundary() to utf8 validation
---
 arrow/benches/array_data_validate.rs | 15 +++++++++++--
 arrow/src/array/data.rs              | 41 +++++++++++++++++++++++++++---------
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
index c46252bec..3cd13c09c 100644
--- a/arrow/benches/array_data_validate.rs
+++ b/arrow/benches/array_data_validate.rs
@@ -37,11 +37,22 @@ fn create_binary_array_data(length: i32) -> ArrayData {
     .unwrap()
 }
 
-fn array_slice_benchmark(c: &mut Criterion) {
+fn validate_utf8_array(arr: &StringArray) {
+    arr.data().validate_values().unwrap();
+}
+
+fn validate_benchmark(c: &mut Criterion) {
+    //Binary Array
     c.bench_function("validate_binary_array_data 20000", |b| {
         b.iter(|| create_binary_array_data(20000))
     });
+
+    //Utf8 Array
+    let str_arr = StringArray::from(vec!["test"; 20000]);
+    c.bench_function("validate_utf8_array_data 20000", |b| {
+        b.iter(|| validate_utf8_array(&str_arr))
+    });
 }
 
-criterion_group!(benches, array_slice_benchmark);
+criterion_group!(benches, validate_benchmark);
 criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 4ae7f069e..c38107b25 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -1141,16 +1141,37 @@ impl ArrayData {
         T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
     {
         let values_buffer = &self.buffers[1].as_slice();
-
-        self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
-            std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
-                ArrowError::InvalidArgumentError(format!(
-                    "Invalid UTF8 sequence at string index {} ({:?}): {}",
-                    string_index, range, e
-                ))
-            })?;
-            Ok(())
-        })
+        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
+            // Validate Offsets are correct
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    if !values_str.is_char_boundary(range.start)
+                        || !values_str.is_char_boundary(range.end)
+                    {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "incomplete utf-8 byte sequence from index {}",
+                            string_index
+                        )));
+                    }
+                    Ok(())
+                },
+            )
+        } else {
+            // find specific offset that failed utf8 validation
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Invalid UTF8 sequence at string index {} ({:?}): {}",
+                            string_index, range, e
+                        ))
+                    })?;
+                    Ok(())
+                },
+            )
+        }
     }
 
     /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are