You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/25 23:30:08 UTC

[arrow-rs] branch master updated: Faster ListArray to StringArray conversion (#3593)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 1afefbbf1 Faster ListArray to StringArray conversion (#3593)
1afefbbf1 is described below

commit 1afefbbf102b73ad1308da9fbf9e0bc4850ddde7
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jan 25 23:30:02 2023 +0000

    Faster ListArray to StringArray conversion (#3593)
---
 arrow-array/src/array/string_array.rs | 65 +++++++++--------------------------
 1 file changed, 16 insertions(+), 49 deletions(-)

diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs
index 926bcc7bf..14db33882 100644
--- a/arrow-array/src/array/string_array.rs
+++ b/arrow-array/src/array/string_array.rs
@@ -45,50 +45,6 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
         self.value(i).chars().count()
     }
 
-    /// Convert a list array to a string array.
-    ///
-    /// Note: this performs potentially expensive UTF-8 validation, consider using
-    /// [`StringBuilder`][crate::builder::StringBuilder] to avoid this
-    ///
-    /// # Panics
-    ///
-    /// This method panics if the array contains non-UTF-8 data
-    fn from_list(v: GenericListArray<OffsetSize>) -> Self {
-        assert_eq!(
-            v.data_ref().child_data().len(),
-            1,
-            "StringArray can only be created from list array of u8 values \
-             (i.e. List<PrimitiveArray<u8>>)."
-        );
-        let child_data = &v.data_ref().child_data()[0];
-
-        assert_eq!(
-            child_data.child_data().len(),
-            0,
-            "StringArray can only be created from list array of u8 values \
-             (i.e. List<PrimitiveArray<u8>>)."
-        );
-        assert_eq!(
-            child_data.data_type(),
-            &DataType::UInt8,
-            "StringArray can only be created from List<u8> arrays, mismatched data types."
-        );
-        assert_eq!(
-            child_data.null_count(),
-            0,
-            "The child array cannot contain null values."
-        );
-
-        let builder = ArrayData::builder(Self::DATA_TYPE)
-            .len(v.len())
-            .offset(v.offset())
-            .add_buffer(v.data().buffers()[0].clone())
-            .add_buffer(child_data.buffers()[0].slice(child_data.offset()))
-            .null_bit_buffer(v.data().null_buffer().cloned());
-
-        Self::from(builder.build().unwrap())
-    }
-
     /// Creates a [`GenericStringArray`] based on an iterator of values without nulls
     pub fn from_iter_values<Ptr, I>(iter: I) -> Self
     where
@@ -208,7 +164,7 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
     for GenericStringArray<OffsetSize>
 {
     fn from(v: GenericListArray<OffsetSize>) -> Self {
-        GenericStringArray::<OffsetSize>::from_list(v)
+        GenericBinaryArray::<OffsetSize>::from(v).into()
     }
 }
 
@@ -290,7 +246,8 @@ pub type LargeStringArray = GenericStringArray<i64>;
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::builder::{ListBuilder, StringBuilder};
+    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
+    use crate::types::UInt8Type;
     use arrow_buffer::Buffer;
     use arrow_schema::Field;
 
@@ -678,7 +635,7 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "StringArray can only be created from List<u8> arrays, mismatched data types."
+        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
     )]
     fn test_string_array_from_list_array_wrong_type() {
         _test_generic_string_array_from_list_array_wrong_type::<i32>();
@@ -686,10 +643,20 @@ mod tests {
 
     #[test]
     #[should_panic(
-        expected = "StringArray can only be created from List<u8> arrays, mismatched data types."
+        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
     )]
     fn test_large_string_array_from_list_array_wrong_type() {
-        _test_generic_string_array_from_list_array_wrong_type::<i32>();
+        _test_generic_string_array_from_list_array_wrong_type::<i64>();
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid UTF-8 sequence: Utf8Error")]
+    fn test_list_array_utf8_validation() {
+        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
+        builder.values().append_value(0xFF);
+        builder.append(true);
+        let list = builder.finish();
+        let _ = StringArray::from(list);
     }
 
     #[test]