You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/24 09:47:27 UTC
[arrow-rs] branch master updated: Faster BinaryArray to StringArray conversion (#3168)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8ba78427e Faster BinaryArray to StringArray conversion (#3168)
8ba78427e is described below
commit 8ba78427ef2fea52ffabe91104b74b17906b3772
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Nov 24 09:47:21 2022 +0000
Faster BinaryArray to StringArray conversion (#3168)
* Faster ByteArray to StringArray conversion
* Add benchmark
* Fix logical conflict
---
arrow-array/src/array/string_array.rs | 16 +++++++++++++++-
arrow/benches/array_data_validate.rs | 6 ++++++
arrow/src/row/mod.rs | 2 +-
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 8d92093f5..fb3bb2317 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -216,8 +216,22 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
+ let offsets = v.value_offsets();
+ let values = v.value_data();
+
+ // Validate the whole values buffer as UTF-8 once, then check that every offset lies on a char boundary
+ let validated = std::str::from_utf8(values).expect("Invalid UTF-8 sequence");
+ for offset in offsets.iter() {
+ assert!(
+ validated.is_char_boundary(offset.as_usize()),
+ "Invalid UTF-8 sequence"
+ )
+ }
+
let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
- Self::from(builder.build().unwrap())
+ // SAFETY:
+ // The values buffer is valid UTF-8 and every offset is a char boundary, as asserted above
+ Self::from(unsafe { builder.build_unchecked() })
}
}
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
index 3cd13c09c..3b0fdbe63 100644
--- a/arrow/benches/array_data_validate.rs
+++ b/arrow/benches/array_data_validate.rs
@@ -52,6 +52,12 @@ fn validate_benchmark(c: &mut Criterion) {
c.bench_function("validate_utf8_array_data 20000", |b| {
b.iter(|| validate_utf8_array(&str_arr))
});
+
+ let byte_array =
+ BinaryArray::from_iter_values(std::iter::repeat(b"test").take(20000));
+ c.bench_function("byte_array_to_string_array 20000", |b| {
+ b.iter(|| StringArray::from(BinaryArray::from(byte_array.data().clone())))
+ });
}
criterion_group!(benches, validate_benchmark);
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index 058c35869..6ce9f2b12 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -1425,7 +1425,7 @@ mod tests {
}
#[test]
- #[should_panic(expected = "Invalid UTF8 sequence at string")]
+ #[should_panic(expected = "Invalid UTF-8 sequence")]
fn test_invalid_utf8() {
let mut converter =
RowConverter::new(vec![SortField::new(DataType::Binary)]).unwrap();