You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2023/01/28 08:13:35 UTC

[arrow-rs] branch master updated: Casting generic binary to generic string (#3607)

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8cc832769 Casting generic binary to generic string (#3607)
8cc832769 is described below

commit 8cc8327696e5f1bd5e647ab7e9fc874abf938b6d
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Sat Jan 28 00:13:29 2023 -0800

    Casting generic binary to generic string (#3607)
    
    * Casting generic binary to generic string
    
    * For CastOptions.safe as false case, applying optimized casting
    
    * Remove offset
---
 arrow-cast/src/cast.rs | 129 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 78 insertions(+), 51 deletions(-)

diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index aec665aa3..9f20dceb9 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -156,8 +156,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
 
         (Utf8, LargeUtf8) => true,
         (LargeUtf8, Utf8) => true,
-        (Binary, LargeBinary) => true,
-        (LargeBinary, Binary) => true,
+        (Binary, LargeBinary | Utf8 | LargeUtf8) => true,
+        (LargeBinary, Binary | Utf8 | LargeUtf8) => true,
         (Utf8,
             Binary
             | LargeBinary
@@ -185,7 +185,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true,
         (Date32, Utf8) | (Date32, LargeUtf8) => true,
         (Date64, Utf8) | (Date64, LargeUtf8) => true,
-        (_, Utf8 | LargeUtf8) => (DataType::is_numeric(from_type) && from_type != &Float16) || from_type == &Binary,
+        (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) && from_type != &Float16,
 
         // start numeric casts
         (
@@ -1180,30 +1180,8 @@ pub fn cast_with_options(
             }
             Date32 => cast_date32_to_string::<i32>(array),
             Date64 => cast_date64_to_string::<i32>(array),
-            Binary => {
-                let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-                Ok(Arc::new(
-                    array
-                        .iter()
-                        .map(|maybe_value| match maybe_value {
-                            Some(value) => {
-                                let result = std::str::from_utf8(value);
-                                if cast_options.safe {
-                                    Ok(result.ok())
-                                } else {
-                                    Some(result.map_err(|_| {
-                                        ArrowError::CastError(
-                                            "Cannot cast binary to string".to_string(),
-                                        )
-                                    }))
-                                    .transpose()
-                                }
-                            }
-                            None => Ok(None),
-                        })
-                        .collect::<Result<StringArray, _>>()?,
-                ))
-            }
+            Binary => cast_binary_to_generic_string::<i32, i32>(array, cast_options),
+            LargeBinary => cast_binary_to_generic_string::<i64, i32>(array, cast_options),
             _ => Err(ArrowError::CastError(format!(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
@@ -1236,30 +1214,8 @@ pub fn cast_with_options(
             }
             Date32 => cast_date32_to_string::<i64>(array),
             Date64 => cast_date64_to_string::<i64>(array),
-            Binary => {
-                let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-                Ok(Arc::new(
-                    array
-                        .iter()
-                        .map(|maybe_value| match maybe_value {
-                            Some(value) => {
-                                let result = std::str::from_utf8(value);
-                                if cast_options.safe {
-                                    Ok(result.ok())
-                                } else {
-                                    Some(result.map_err(|_| {
-                                        ArrowError::CastError(
-                                            "Cannot cast binary to string".to_string(),
-                                        )
-                                    }))
-                                    .transpose()
-                                }
-                            }
-                            None => Ok(None),
-                        })
-                        .collect::<Result<LargeStringArray, _>>()?,
-                ))
-            }
+            Binary => cast_binary_to_generic_string::<i32, i64>(array, cast_options),
+            LargeBinary => cast_binary_to_generic_string::<i64, i64>(array, cast_options),
             _ => Err(ArrowError::CastError(format!(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
@@ -3436,6 +3392,77 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
     Ok(Arc::new(list) as ArrayRef)
 }
 
+/// Casts a `GenericBinaryArray<I>` to a `GenericStringArray<O>`, validating the bytes as UTF-8.
+/// With `cast_options.safe` set, each value that is not valid UTF-8 becomes null; otherwise invalid
+/// UTF-8, or an offset that does not fit in `O`, returns `Err`. Panics if `array` is not binary of `I`.
+fn cast_binary_to_generic_string<I, O>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+    I: OffsetSizeTrait + ToPrimitive,
+    O: OffsetSizeTrait + NumCast,
+{
+    let array = array
+        .as_any()
+        .downcast_ref::<GenericByteArray<GenericBinaryType<I>>>()
+        .unwrap();
+
+    if !cast_options.safe {
+        let offsets = array.value_offsets();
+        let values = array.value_data();
+
+        // Validate the whole values buffer once; offsets are then checked to fall on char boundaries
+        let validated = std::str::from_utf8(values)
+            .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
+
+        let mut offset_builder = BufferBuilder::<O>::new(offsets.len());
+        offsets
+            .iter()
+            .try_for_each::<_, Result<_, ArrowError>>(|offset| {
+                if !validated.is_char_boundary(offset.as_usize()) {
+                    return Err(ArrowError::CastError(
+                        "Invalid UTF-8 sequence".to_string(),
+                    ));
+                }
+
+                let offset = <O as NumCast>::from(*offset).ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "{}Binary array too large to cast to {}String array",
+                        I::PREFIX,
+                        O::PREFIX
+                    ))
+                })?;
+                offset_builder.append(offset);
+                Ok(())
+            })?;
+
+        let offset_buffer = offset_builder.finish();
+
+        let builder = ArrayData::builder(GenericStringArray::<O>::DATA_TYPE)
+            .len(array.len())
+            .add_buffer(offset_buffer)
+            .add_buffer(array.data().buffers()[1].clone())
+            .null_count(array.null_count())
+            .null_bit_buffer(array.data().null_buffer().cloned());
+
+        // SAFETY: the values buffer was validated as UTF-8 above and every offset is on a char boundary.
+        // NOTE(review): the null buffer is cloned as-is and no offset is set — confirm for sliced arrays.
+        Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
+            builder.build_unchecked()
+        })))
+    } else {
+        Ok(Arc::new(
+            array
+                .iter()
+                .map(|maybe_value| {
+                    maybe_value.and_then(|value| std::str::from_utf8(value).ok())
+                })
+                .collect::<GenericByteArray<GenericStringType<O>>>(),
+        ))
+    }
+}
+
 /// Helper function to cast from one `ByteArrayType` to another and vice versa.
 /// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error.
 fn cast_byte_container<FROM, TO, N: ?Sized>(