You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/24 09:47:27 UTC

[arrow-rs] branch master updated: Faster BinaryArray to StringArray conversion (#3168)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8ba78427e Faster BinaryArray to StringArray conversion (#3168)
8ba78427e is described below

commit 8ba78427ef2fea52ffabe91104b74b17906b3772
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Nov 24 09:47:21 2022 +0000

    Faster BinaryArray to StringArray conversion (#3168)
    
    * Faster ByteArray to StringArray conversion
    
    * Add benchmark
    
    * Fix logical conflict
---
 arrow-array/src/array/string_array.rs | 16 +++++++++++++++-
 arrow/benches/array_data_validate.rs  |  6 ++++++
 arrow/src/row/mod.rs                  |  2 +-
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 8d92093f5..fb3bb2317 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -216,8 +216,22 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
     for GenericStringArray<OffsetSize>
 {
     fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
+        let offsets = v.value_offsets();
+        let values = v.value_data();
+
+        // We only need to validate that the values are valid UTF-8 and that every offset is a char boundary
+        let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence");
+        for offset in offsets.iter() {
+            assert!(
+                validated.is_char_boundary(offset.as_usize()),
+                "Invalid UTF-8 sequence"
+            )
+        }
+
         let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
-        Self::from(builder.build().unwrap())
+        // SAFETY:
+        // Values validated as UTF-8 and all offsets checked to be char boundaries above
+        Self::from(unsafe { builder.build_unchecked() })
     }
 }
 
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
index 3cd13c09c..3b0fdbe63 100644
--- a/arrow/benches/array_data_validate.rs
+++ b/arrow/benches/array_data_validate.rs
@@ -52,6 +52,12 @@ fn validate_benchmark(c: &mut Criterion) {
     c.bench_function("validate_utf8_array_data 20000", |b| {
         b.iter(|| validate_utf8_array(&str_arr))
     });
+
+    let byte_array =
+        BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000));
+    c.bench_function("byte_array_to_string_array 20000", |b| {
+        b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone())))
+    });
 }
 
 criterion_group!(benches, validate_benchmark);
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index 058c35869..6ce9f2b12 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -1425,7 +1425,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Invalid UTF8 sequence at string")]
+    #[should_panic(expected = "Invalid UTF-8 sequence")]
     fn test_invalid_utf8() {
         let mut converter =
             RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();