You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2023/01/31 17:32:41 UTC
[arrow-rs] branch master updated: Specified version of helper function to cast binary to string (#3624)
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new dd168114a Specified version of helper function to cast binary to string (#3624)
dd168114a is described below
commit dd168114a92fc2dc61c74415bbe8ea0a2f5a99ce
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Tue Jan 31 09:32:35 2023 -0800
Specified version of helper function to cast binary to string (#3624)
* Specified version of helper function to cast binary to string
* Simplify it
---
arrow-cast/src/cast.rs | 65 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 63 insertions(+), 2 deletions(-)
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index 9f20dceb9..c0082b347 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -1180,7 +1180,7 @@ pub fn cast_with_options(
}
Date32 => cast_date32_to_string::<i32>(array),
Date64 => cast_date64_to_string::<i32>(array),
- Binary => cast_binary_to_generic_string::<i32, i32>(array, cast_options),
+ Binary => cast_binary_to_string::<i32>(array, cast_options),
LargeBinary => cast_binary_to_generic_string::<i64, i32>(array, cast_options),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -1215,7 +1215,7 @@ pub fn cast_with_options(
Date32 => cast_date32_to_string::<i64>(array),
Date64 => cast_date64_to_string::<i64>(array),
Binary => cast_binary_to_generic_string::<i32, i64>(array, cast_options),
- LargeBinary => cast_binary_to_generic_string::<i64, i64>(array, cast_options),
+ LargeBinary => cast_binary_to_string::<i64>(array, cast_options),
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
@@ -3392,6 +3392,66 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list) as ArrayRef)
}
+/// A specialized helper to cast from `GenericBinaryArray` to `GenericStringArray` when both use
+/// the same offset size `O`, so re-encoding the offsets is unnecessary.
+///
+/// When `cast_options.safe` is `false`, the whole value buffer is checked with
+/// `std::str::from_utf8` and every offset must lie on a char boundary; otherwise
+/// `ArrowError::CastError("Invalid UTF-8 sequence")` is returned.
+///
+/// When `cast_options.safe` is `true`, each element is checked individually and a new validity
+/// bitmap is built in which null elements and elements that are not valid UTF-8 are marked null.
+///
+/// # Panics
+///
+/// Panics (via `unwrap`) if `array` cannot be downcast to `GenericByteArray<GenericBinaryType<O>>`.
+fn cast_binary_to_string<O>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+ O: OffsetSizeTrait + ToPrimitive,
+{
+ // The caller is expected to have dispatched on the data type already, hence the `unwrap`.
+ let array = array
+ .as_any()
+ .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
+ .unwrap();
+
+ if !cast_options.safe {
+ let offsets = array.value_offsets();
+ let values = array.value_data();
+
+ // We only need to validate that all values are valid UTF-8
+ // NOTE(review): this validates the entire buffer returned by `value_data()`, which
+ // presumably also covers bytes that belong to null slots -- TODO confirm.
+ let validated = std::str::from_utf8(values)
+ .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
+ // Checks if the offsets are valid but does not re-encode
+ for offset in offsets.iter() {
+ if !validated.is_char_boundary(offset.as_usize()) {
+ return Err(ArrowError::CastError("Invalid UTF-8 sequence".to_string()));
+ }
+ }
+
+ // Reuse the existing array data and change only the data type to the string type.
+ let builder = array
+ .into_data()
+ .into_builder()
+ .data_type(GenericStringArray::<O>::DATA_TYPE);
+ // SAFETY:
+ // Validated UTF-8 above
+ Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
+ builder.build_unchecked()
+ })))
+ } else {
+ // Build a fresh validity bitmap: an element stays valid only if it is non-null and
+ // its bytes are valid UTF-8.
+ let mut null_builder = BooleanBufferBuilder::new(array.len());
+ array.iter().for_each(|maybe_value| {
+ null_builder.append(
+ maybe_value
+ .and_then(|value| std::str::from_utf8(value).ok())
+ .is_some(),
+ );
+ });
+
+ // NOTE(review): the builder calls below replace only the validity bitmap and the data
+ // type; invalid byte sequences are masked as null rather than removed from the values.
+ // NOTE(review): the new bitmap has `array.len()` bits starting at bit 0 -- TODO confirm
+ // how this interacts with an array whose data has a non-zero offset (a sliced array).
+ let builder = array
+ .into_data()
+ .into_builder()
+ .null_bit_buffer(Some(null_builder.finish()))
+ .data_type(GenericStringArray::<O>::DATA_TYPE);
+ // SAFETY:
+ // Validated UTF-8 above
+ // NOTE(review): only elements left valid in the bitmap were verified as UTF-8 above.
+ Ok(Arc::new(GenericStringArray::<O>::from(unsafe {
+ builder.build_unchecked()
+ })))
+ }
+}
+
/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs
/// UTF-8 validation during casting. An invalid UTF-8 value is either converted to null or causes an
/// `Err` to be returned, depending on `CastOptions`.
@@ -3417,6 +3477,7 @@ where
.map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?;
let mut offset_builder = BufferBuilder::<O>::new(offsets.len());
+ // Checks that each offset is a valid char boundary and re-encodes the offset
offsets
.iter()
.try_for_each::<_, Result<_, ArrowError>>(|offset| {