You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/06 13:28:20 UTC
[arrow-rs] branch master updated: Add `Substring_by_char` (#1784)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new dac2132f1 Add `Substring_by_char` (#1784)
dac2132f1 is described below
commit dac2132f157b1ef0dda04119d4a8b0221efeadec
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Mon Jun 6 21:28:14 2022 +0800
Add `Substring_by_char` (#1784)
* add substring by char
Signed-off-by: remzi <13...@gmail.com>
* update docs
Signed-off-by: remzi <13...@gmail.com>
* update docs
Signed-off-by: remzi <13...@gmail.com>
* add benchmark
Signed-off-by: remzi <13...@gmail.com>
* Update arrow/src/compute/kernels/substring.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
* Update arrow/src/compute/kernels/substring.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
* Update arrow/src/compute/kernels/substring.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
* Update arrow/src/compute/kernels/substring.rs
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
arrow/benches/string_kernels.rs | 14 ++-
arrow/src/compute/kernels/substring.rs | 220 ++++++++++++++++++++++++++++++++-
2 files changed, 228 insertions(+), 6 deletions(-)
diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/string_kernels.rs
index 7df52a6bb..6bbfc9c09 100644
--- a/arrow/benches/string_kernels.rs
+++ b/arrow/benches/string_kernels.rs
@@ -22,13 +22,21 @@ use criterion::Criterion;
extern crate arrow;
use arrow::array::*;
-use arrow::compute::kernels::substring::substring;
+use arrow::compute::kernels::substring::*;
use arrow::util::bench_util::*;
fn bench_substring(arr: &dyn Array, start: i64, length: Option<u64>) {
substring(criterion::black_box(arr), start, length).unwrap();
}
+fn bench_substring_by_char<O: OffsetSizeTrait>(
+ arr: &GenericStringArray<O>,
+ start: i64,
+ length: Option<u64>,
+) {
+ substring_by_char(criterion::black_box(arr), start, length).unwrap();
+}
+
fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let val_len = 1000;
@@ -44,6 +52,10 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64)))
});
+ c.bench_function("substring utf8 by char", |b| {
+ b.iter(|| bench_substring_by_char(&arr_string, 1, Some((val_len - 1) as u64)))
+ });
+
c.bench_function("substring fixed size binary array", |b| {
b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64)))
});
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index f1b6e8d4a..1954307e9 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -16,7 +16,8 @@
// under the License.
//! Defines kernel to extract a substring of an Array
-//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray
+//! Supported array types:
+//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
use crate::array::DictionaryArray;
use crate::buffer::MutableBuffer;
@@ -29,7 +30,7 @@ use crate::{
use std::cmp::Ordering;
use std::sync::Arc;
-/// Returns an ArrayRef with substrings of all the elements in `array`.
+/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
///
/// # Arguments
///
@@ -38,7 +39,7 @@ use std::sync::Arc;
/// otherwise count from the end of the string.
///
/// * `length`(option) - The length of all substrings.
-/// If `length` is `None`, then the substring is from `start` to the end of the string.
+/// If `length` is [None], then the substring is from `start` to the end of the string.
///
/// Attention: Both `start` and `length` are counted by byte, not by char.
///
@@ -53,9 +54,10 @@ use std::sync::Arc;
/// ```
///
/// # Error
-/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
-/// array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
+/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`]
+/// or [`DictionaryArray`] with supported array type as its value type.
/// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
+/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars.
///
/// ## Example of trying to get an invalid utf-8 format substring
/// ```
@@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
}
}
+/// # Arguments
+/// * `array` - The input string array
+///
+/// * `start` - The start index of all substrings.
+/// If `start >= 0`, then count from the start of the string,
+/// otherwise count from the end of the string.
+///
+/// * `length`(option) - The length of all substrings.
+/// If `length` is `None`, then the substring is from `start` to the end of the string.
+///
+/// Attention: Both `start` and `length` are counted by char.
+///
+/// # Performance
+/// This function is slower than [substring].
+/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
+/// It is recommended to use [substring] if the input array only contains ASCII chars.
+///
+/// # Basic usage
+/// ```
+/// # use arrow::array::StringArray;
+/// # use arrow::compute::kernels::substring::substring_by_char;
+/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
+/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
+/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
+/// ```
+pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ start: i64,
+ length: Option<u64>,
+) -> Result<GenericStringArray<OffsetSize>> {
+ Ok(array
+ .iter()
+ .map(|val| {
+ val.map(|val| {
+ let char_count = val.chars().count();
+ let start = if start >= 0 {
+ start.to_usize().unwrap().min(char_count)
+ } else {
+ char_count - (-start).to_usize().unwrap().min(char_count)
+ };
+ let length = length.map_or(char_count - start, |length| {
+ length.to_usize().unwrap().min(char_count - start)
+ });
+
+ val.chars().skip(start).take(length).collect::<String>()
+ })
+ })
+ .collect::<GenericStringArray<OffsetSize>>())
+}
+
fn binary_substring<OffsetSize: OffsetSizeTrait>(
array: &GenericBinaryArray<OffsetSize>,
start: OffsetSize,
@@ -1083,6 +1135,164 @@ mod tests {
generic_string_with_non_zero_offset::<i64>()
}
+ fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+ let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")];
+ let cases = vec![
+ // all-nulls array is always identical
+ (vec![None, None, None], 0, None, vec![None, None, None]),
+ // identity
+ (
+ input_vals.clone(),
+ 0,
+ None,
+ vec![Some("hello"), None, Some("Γ ⊢x:T")],
+ ),
+ // 0 length -> Nothing
+ (
+ input_vals.clone(),
+ 0,
+ Some(0),
+ vec![Some(""), None, Some("")],
+ ),
+ // high start -> Nothing
+ (
+ input_vals.clone(),
+ 1000,
+ Some(0),
+ vec![Some(""), None, Some("")],
+ ),
+ // high negative start -> identity
+ (
+ input_vals.clone(),
+ -1000,
+ None,
+ vec![Some("hello"), None, Some("Γ ⊢x:T")],
+ ),
+ // high length -> identity
+ (
+ input_vals.clone(),
+ 0,
+ Some(1000),
+ vec![Some("hello"), None, Some("Γ ⊢x:T")],
+ ),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array = GenericStringArray::<O>::from(array);
+ let result = substring_by_char(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+
+ let expected = GenericStringArray::<O>::from(expected);
+ assert_eq!(expected, result);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn with_nulls_string_by_char() -> Result<()> {
+ with_nulls_generic_string_by_char::<i32>()
+ }
+
+ #[test]
+ fn with_nulls_large_string_by_char() -> Result<()> {
+ with_nulls_generic_string_by_char::<i64>()
+ }
+
+ fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+ let input_vals = vec!["hello", "", "Γ ⊢x:T"];
+ let cases = vec![
+ // empty array is always identical
+ (vec!["", "", ""], 0, None, vec!["", "", ""]),
+ // increase start
+ (input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]),
+ (input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]),
+ (input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]),
+ (input_vals.clone(), 3, None, vec!["lo", "", "x:T"]),
+ (input_vals.clone(), 10, None, vec!["", "", ""]),
+ // increase start negatively
+ (input_vals.clone(), -1, None, vec!["o", "", "T"]),
+ (input_vals.clone(), -2, None, vec!["lo", "", ":T"]),
+ (input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]),
+ (input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]),
+ // increase length
+ (input_vals.clone(), 1, Some(1), vec!["e", "", " "]),
+ (input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]),
+ (input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]),
+ (input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]),
+ (input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]),
+ (input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]),
+ (input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]),
+ (input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array = GenericStringArray::<O>::from(array);
+ let result = substring_by_char(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+ let expected = GenericStringArray::<O>::from(expected);
+ assert_eq!(expected, result);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn without_nulls_string_by_char() -> Result<()> {
+ without_nulls_generic_string_by_char::<i32>()
+ }
+
+ #[test]
+ fn without_nulls_large_string_by_char() -> Result<()> {
+ without_nulls_generic_string_by_char::<i64>()
+ }
+
+ fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() -> Result<()> {
+ let values = "S→T = Πx:S.T";
+ let offsets = &[
+ O::zero(),
+ O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap())
+ .unwrap(),
+ O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap())
+ .unwrap(),
+ O::from_usize(values.len()).unwrap(),
+ ];
+ // set the first and third element to be valid
+ let bitmap = [0b101_u8];
+
+ let data = ArrayData::builder(GenericStringArray::<O>::get_data_type())
+ .len(2)
+ .add_buffer(Buffer::from_slice_ref(offsets))
+ .add_buffer(Buffer::from(values))
+ .null_bit_buffer(Some(Buffer::from(bitmap)))
+ .offset(1)
+ .build()?;
+ // array is `[null, "Πx:S.T"]`
+ let array = GenericStringArray::<O>::from(data);
+ // result is `[null, "x:S.T"]`
+ let result = substring_by_char(&array, 1, None)?;
+ let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
+ assert_eq!(result, expected);
+
+ Ok(())
+ }
+
+ #[test]
+ fn string_with_non_zero_offset_by_char() -> Result<()> {
+ generic_string_by_char_with_non_zero_offset::<i32>()
+ }
+
+ #[test]
+ fn large_string_with_non_zero_offset_by_char() -> Result<()> {
+ generic_string_by_char_with_non_zero_offset::<i64>()
+ }
+
#[test]
fn dictionary() -> Result<()> {
_dictionary::<Int8Type>()?;