You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/25 20:29:24 UTC
[arrow-rs] branch master updated: Add `substring` support for binary (#1608)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 3e933ea42 Add `substring` support for binary (#1608)
3e933ea42 is described below
commit 3e933ea42526cc768b91ae00acef69099c558920
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Tue Apr 26 04:29:20 2022 +0800
Add `substring` support for binary (#1608)
* add substring for binary
fix some test for string array
Signed-off-by: remzi <13...@gmail.com>
* fix another bug in test
Signed-off-by: remzi <13...@gmail.com>
* update doc
Signed-off-by: remzi <13...@gmail.com>
* add with_nulls tests
Signed-off-by: remzi <13...@gmail.com>
* add without_nulls tests
Signed-off-by: remzi <13...@gmail.com>
* fix clippy
Signed-off-by: remzi <13...@gmail.com>
---
arrow/src/compute/kernels/substring.rs | 329 +++++++++++++++++++++++++++++++--
1 file changed, 309 insertions(+), 20 deletions(-)
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index df05a73c0..b687757c4 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-//! Defines kernel to extract a substring of a \[Large\]StringArray
+//! Defines kernel to extract a substring of an Array
+//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray
use crate::buffer::MutableBuffer;
use crate::{array::*, buffer::Buffer};
@@ -25,7 +26,68 @@ use crate::{
};
use std::cmp::Ordering;
-fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
+fn binary_substring<OffsetSize: BinaryOffsetSizeTrait>(
+ array: &GenericBinaryArray<OffsetSize>,
+ start: OffsetSize,
+ length: Option<OffsetSize>,
+) -> Result<ArrayRef> {
+ let offsets = array.value_offsets();
+ let null_bit_buffer = array.data_ref().null_buffer().cloned();
+ let values = array.value_data();
+ let data = values.as_slice();
+ let zero = OffsetSize::zero();
+
+ // start and end offsets of all substrings
+ let mut new_starts_ends: Vec<(OffsetSize, OffsetSize)> =
+ Vec::with_capacity(array.len());
+ let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
+ let mut len_so_far = zero;
+ new_offsets.push(zero);
+
+ offsets.windows(2).for_each(|pair| {
+ let new_start = match start.cmp(&zero) {
+ Ordering::Greater => (pair[0] + start).min(pair[1]),
+ Ordering::Equal => pair[0],
+ Ordering::Less => (pair[1] + start).max(pair[0]),
+ };
+ let new_end = match length {
+ Some(length) => (length + new_start).min(pair[1]),
+ None => pair[1],
+ };
+ len_so_far += new_end - new_start;
+ new_starts_ends.push((new_start, new_end));
+ new_offsets.push(len_so_far);
+ });
+
+ // concatenate substrings into a buffer
+ let mut new_values =
+ MutableBuffer::new(new_offsets.last().unwrap().to_usize().unwrap());
+
+ new_starts_ends
+ .iter()
+ .map(|(start, end)| {
+ let start = start.to_usize().unwrap();
+ let end = end.to_usize().unwrap();
+ &data[start..end]
+ })
+ .for_each(|slice| new_values.extend_from_slice(slice));
+
+ let data = unsafe {
+ ArrayData::new_unchecked(
+ <OffsetSize as BinaryOffsetSizeTrait>::DATA_TYPE,
+ array.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![Buffer::from_slice_ref(&new_offsets), new_values.into()],
+ vec![],
+ )
+ };
+ Ok(make_array(data))
+}
+
+/// substring by byte
+fn utf8_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: OffsetSize,
length: Option<OffsetSize>,
@@ -128,8 +190,8 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
/// ```
///
/// # Error
-/// - The function errors when the passed array is not a \[Large\]String array.
-/// - The function errors if the offset of a substring in the input array is at invalid char boundary.
+/// - The function errors when the passed array is not a \[Large\]String array or \[Large\]Binary array.
+/// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
///
/// ## Example of trying to get an invalid utf-8 format substring
/// ```
@@ -141,7 +203,23 @@ fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
/// ```
pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<ArrayRef> {
match array.data_type() {
- DataType::LargeUtf8 => generic_substring(
+ DataType::LargeBinary => binary_substring(
+ array
+ .as_any()
+ .downcast_ref::<LargeBinaryArray>()
+ .expect("A large binary is expected"),
+ start,
+ length.map(|e| e as i64),
+ ),
+ DataType::Binary => binary_substring(
+ array
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .expect("A binary is expected"),
+ start as i32,
+ length.map(|e| e as i32),
+ ),
+ DataType::LargeUtf8 => utf8_substring(
array
.as_any()
.downcast_ref::<LargeStringArray>()
@@ -149,7 +227,7 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
start,
length.map(|e| e as i64),
),
- DataType::Utf8 => generic_substring(
+ DataType::Utf8 => utf8_substring(
array
.as_any()
.downcast_ref::<StringArray>()
@@ -168,8 +246,214 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
mod tests {
use super::*;
- fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
- ) -> Result<()> {
+ #[allow(clippy::type_complexity)]
+ fn with_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
+ let cases: Vec<(Vec<Option<&[u8]>>, i64, Option<u64>, Vec<Option<&[u8]>>)> = vec![
+ // identity
+ (
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ 0,
+ None,
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ ),
+ // 0 length -> Nothing
+ (
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ 0,
+ Some(0),
+ vec![Some(&[]), None, Some(&[])],
+ ),
+ // high start -> Nothing
+ (
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ 1000,
+ Some(0),
+ vec![Some(&[]), None, Some(&[])],
+ ),
+ // high negative start -> identity
+ (
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ -1000,
+ None,
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ ),
+ // high length -> identity
+ (
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ 0,
+ Some(1000),
+ vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
+ ),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array = GenericBinaryArray::<O>::from(array);
+ let result: ArrayRef = substring(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+
+ let result = result
+ .as_any()
+ .downcast_ref::<GenericBinaryArray<O>>()
+ .unwrap();
+ let expected = GenericBinaryArray::<O>::from(expected);
+ assert_eq!(&expected, result);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn with_nulls_binary() -> Result<()> {
+ with_nulls_generic_binary::<i32>()
+ }
+
+ #[test]
+ fn with_nulls_large_binary() -> Result<()> {
+ with_nulls_generic_binary::<i64>()
+ }
+
+ #[allow(clippy::type_complexity)]
+ fn without_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
+ let cases: Vec<(Vec<&[u8]>, i64, Option<u64>, Vec<&[u8]>)> = vec![
+ // increase start
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 0,
+ None,
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 1,
+ None,
+ vec![b"ello", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 2,
+ None,
+ vec![b"llo", b"", &[0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 3,
+ None,
+ vec![b"lo", b"", &[0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 10,
+ None,
+ vec![b"", b"", b""],
+ ),
+ // increase start negatively
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -1,
+ None,
+ vec![b"o", b"", &[0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -2,
+ None,
+ vec![b"lo", b"", &[0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -3,
+ None,
+ vec![b"llo", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -10,
+ None,
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ ),
+ // increase length
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 1,
+ Some(1),
+ vec![b"e", b"", &[0xf9]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 1,
+ Some(2),
+ vec![b"el", b"", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 1,
+ Some(3),
+ vec![b"ell", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ 1,
+ Some(4),
+ vec![b"ello", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -3,
+ Some(1),
+ vec![b"l", b"", &[0xf9]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -3,
+ Some(2),
+ vec![b"ll", b"", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -3,
+ Some(3),
+ vec![b"llo", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ (
+ vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
+ -3,
+ Some(4),
+ vec![b"llo", b"", &[0xf9, 0xff, 0xfa]],
+ ),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array = GenericBinaryArray::<O>::from(array);
+ let result = substring(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+ let result = result
+ .as_any()
+ .downcast_ref::<GenericBinaryArray<O>>()
+ .unwrap();
+ let expected = GenericBinaryArray::<O>::from(expected);
+ assert_eq!(&expected, result,);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn without_nulls_binary() -> Result<()> {
+ without_nulls_generic_binary::<i32>()
+ }
+
+ #[test]
+ fn without_nulls_large_binary() -> Result<()> {
+ without_nulls_generic_binary::<i64>()
+ }
+
+ fn with_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
let cases = vec![
// identity
(
@@ -210,12 +494,15 @@ mod tests {
cases.into_iter().try_for_each::<_, Result<()>>(
|(array, start, length, expected)| {
- let array = T::from(array);
+ let array = GenericStringArray::<O>::from(array);
let result: ArrayRef = substring(&array, start, length)?;
assert_eq!(array.len(), result.len());
- let result = result.as_any().downcast_ref::<T>().unwrap();
- let expected = T::from(expected);
+ let result = result
+ .as_any()
+ .downcast_ref::<GenericStringArray<O>>()
+ .unwrap();
+ let expected = GenericStringArray::<O>::from(expected);
assert_eq!(&expected, result);
Ok(())
},
@@ -226,16 +513,15 @@ mod tests {
#[test]
fn with_nulls_string() -> Result<()> {
- with_nulls::<StringArray>()
+ with_nulls_generic_string::<i32>()
}
#[test]
fn with_nulls_large_string() -> Result<()> {
- with_nulls::<LargeStringArray>()
+ with_nulls_generic_string::<i64>()
}
- fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
- ) -> Result<()> {
+ fn without_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
let cases = vec![
// increase start
(
@@ -291,11 +577,14 @@ mod tests {
cases.into_iter().try_for_each::<_, Result<()>>(
|(array, start, length, expected)| {
- let array = StringArray::from(array);
+ let array = GenericStringArray::<O>::from(array);
let result = substring(&array, start, length)?;
assert_eq!(array.len(), result.len());
- let result = result.as_any().downcast_ref::<StringArray>().unwrap();
- let expected = StringArray::from(expected);
+ let result = result
+ .as_any()
+ .downcast_ref::<GenericStringArray<O>>()
+ .unwrap();
+ let expected = GenericStringArray::<O>::from(expected);
assert_eq!(&expected, result,);
Ok(())
},
@@ -306,12 +595,12 @@ mod tests {
#[test]
fn without_nulls_string() -> Result<()> {
- without_nulls::<StringArray>()
+ without_nulls_generic_string::<i32>()
}
#[test]
fn without_nulls_large_string() -> Result<()> {
- without_nulls::<LargeStringArray>()
+ without_nulls_generic_string::<i64>()
}
#[test]