You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/06 13:28:20 UTC

[arrow-rs] branch master updated: Add `Substring_by_char` (#1784)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new dac2132f1 Add `Substring_by_char` (#1784)
dac2132f1 is described below

commit dac2132f157b1ef0dda04119d4a8b0221efeadec
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Mon Jun 6 21:28:14 2022 +0800

    Add `Substring_by_char` (#1784)
    
    * add substring by char
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * update docs
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * update docs
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add benchmark
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * Update arrow/src/compute/kernels/substring.rs
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    * Update arrow/src/compute/kernels/substring.rs
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    * Update arrow/src/compute/kernels/substring.rs
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    * Update arrow/src/compute/kernels/substring.rs
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
---
 arrow/benches/string_kernels.rs        |  14 ++-
 arrow/src/compute/kernels/substring.rs | 220 ++++++++++++++++++++++++++++++++-
 2 files changed, 228 insertions(+), 6 deletions(-)

diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/string_kernels.rs
index 7df52a6bb..6bbfc9c09 100644
--- a/arrow/benches/string_kernels.rs
+++ b/arrow/benches/string_kernels.rs
@@ -22,13 +22,21 @@ use criterion::Criterion;
 extern crate arrow;
 
 use arrow::array::*;
-use arrow::compute::kernels::substring::substring;
+use arrow::compute::kernels::substring::*;
 use arrow::util::bench_util::*;
 
 fn bench_substring(arr: &dyn Array, start: i64, length: Option<u64>) {
     substring(criterion::black_box(arr), start, length).unwrap();
 }
 
+fn bench_substring_by_char<O: OffsetSizeTrait>(
+    arr: &GenericStringArray<O>,
+    start: i64,
+    length: Option<u64>,
+) {
+    substring_by_char(criterion::black_box(arr), start, length).unwrap();
+}
+
 fn add_benchmark(c: &mut Criterion) {
     let size = 65536;
     let val_len = 1000;
@@ -44,6 +52,10 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64)))
     });
 
+    c.bench_function("substring utf8 by char", |b| {
+        b.iter(|| bench_substring_by_char(&arr_string, 1, Some((val_len - 1) as u64)))
+    });
+
     c.bench_function("substring fixed size binary array", |b| {
         b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64)))
     });
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index f1b6e8d4a..1954307e9 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -16,7 +16,8 @@
 // under the License.
 
 //! Defines kernel to extract a substring of an Array
-//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray
+//! Supported array types:
+//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
 
 use crate::array::DictionaryArray;
 use crate::buffer::MutableBuffer;
@@ -29,7 +30,7 @@ use crate::{
 use std::cmp::Ordering;
 use std::sync::Arc;
 
-/// Returns an ArrayRef with substrings of all the elements in `array`.
+/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
 ///
 /// # Arguments
 ///
@@ -38,7 +39,7 @@ use std::sync::Arc;
 /// otherwise count from the end of the string.
 ///
 /// * `length`(option) - The length of all substrings.
-/// If `length` is `None`, then the substring is from `start` to the end of the string.
+/// If `length` is [None], then the substring is from `start` to the end of the string.
 ///
 /// Attention: Both `start` and `length` are counted by byte, not by char.
 ///
@@ -53,9 +54,10 @@ use std::sync::Arc;
 /// ```
 ///
 /// # Error
-/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
-///   array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
+/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`]
+///   or [`DictionaryArray`] with supported array type as its value type.
 /// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
+/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars.
 ///
 /// ## Example of trying to get an invalid utf-8 format substring
 /// ```
@@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
     }
 }
 
+/// # Arguments
+/// * `array` - The input string array
+///
+/// * `start` - The start index of all substrings.
+/// If `start >= 0`, then count from the start of the string,
+/// otherwise count from the end of the string.
+///
+/// * `length`(option) - The length of all substrings.
+/// If `length` is `None`, then the substring is from `start` to the end of the string.
+///
+/// Attention: Both `start` and `length` are counted by char.
+///
+/// # Performance
+/// This function is slower than [substring].
+/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
+/// It is recommended to use [substring] if the input array only contains ASCII chars.
+///
+/// # Basic usage
+/// ```
+/// # use arrow::array::StringArray;
+/// # use arrow::compute::kernels::substring::substring_by_char;
+/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
+/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
+/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
+/// ```
+pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
+    array: &GenericStringArray<OffsetSize>,
+    start: i64,
+    length: Option<u64>,
+) -> Result<GenericStringArray<OffsetSize>> {
+    Ok(array
+        .iter()
+        .map(|val| {
+            val.map(|val| {
+                let char_count = val.chars().count();
+                let start = if start >= 0 {
+                    start.to_usize().unwrap().min(char_count)
+                } else {
+                    char_count - (-start).to_usize().unwrap().min(char_count)
+                };
+                let length = length.map_or(char_count - start, |length| {
+                    length.to_usize().unwrap().min(char_count - start)
+                });
+
+                val.chars().skip(start).take(length).collect::<String>()
+            })
+        })
+        .collect::<GenericStringArray<OffsetSize>>())
+}
+
 fn binary_substring<OffsetSize: OffsetSizeTrait>(
     array: &GenericBinaryArray<OffsetSize>,
     start: OffsetSize,
@@ -1083,6 +1135,164 @@ mod tests {
         generic_string_with_non_zero_offset::<i64>()
     }
 
+    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")];
+        let cases = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], 0, None, vec![None, None, None]),
+            // identity
+            (
+                input_vals.clone(),
+                0,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // 0 length -> Nothing
+            (
+                input_vals.clone(),
+                0,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high start -> Nothing
+            (
+                input_vals.clone(),
+                1000,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high negative start -> identity
+            (
+                input_vals.clone(),
+                -1000,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // high length -> identity
+            (
+                input_vals.clone(),
+                0,
+                Some(1000),
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = GenericStringArray::<O>::from(array);
+                let result = substring_by_char(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+
+                let expected = GenericStringArray::<O>::from(expected);
+                assert_eq!(expected, result);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn with_nulls_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i32>()
+    }
+
+    #[test]
+    fn with_nulls_large_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i64>()
+    }
+
+    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec!["hello", "", "Γ ⊢x:T"];
+        let cases = vec![
+            // empty array is always identical
+            (vec!["", "", ""], 0, None, vec!["", "", ""]),
+            // increase start
+            (input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]),
+            (input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]),
+            (input_vals.clone(), 3, None, vec!["lo", "", "x:T"]),
+            (input_vals.clone(), 10, None, vec!["", "", ""]),
+            // increase start negatively
+            (input_vals.clone(), -1, None, vec!["o", "", "T"]),
+            (input_vals.clone(), -2, None, vec!["lo", "", ":T"]),
+            (input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]),
+            (input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]),
+            // increase length
+            (input_vals.clone(), 1, Some(1), vec!["e", "", " "]),
+            (input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]),
+            (input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]),
+            (input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]),
+            (input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]),
+            (input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]),
+            (input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = GenericStringArray::<O>::from(array);
+                let result = substring_by_char(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+                let expected = GenericStringArray::<O>::from(expected);
+                assert_eq!(expected, result);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn without_nulls_string_by_char() -> Result<()> {
+        without_nulls_generic_string_by_char::<i32>()
+    }
+
+    #[test]
+    fn without_nulls_large_string_by_char() -> Result<()> {
+        without_nulls_generic_string_by_char::<i64>()
+    }
+
+    fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() -> Result<()> {
+        let values = "S→T = Πx:S.T";
+        let offsets = &[
+            O::zero(),
+            O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap())
+                .unwrap(),
+            O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap())
+                .unwrap(),
+            O::from_usize(values.len()).unwrap(),
+        ];
+        // set the first and third element to be valid
+        let bitmap = [0b101_u8];
+
+        let data = ArrayData::builder(GenericStringArray::<O>::get_data_type())
+            .len(2)
+            .add_buffer(Buffer::from_slice_ref(offsets))
+            .add_buffer(Buffer::from(values))
+            .null_bit_buffer(Some(Buffer::from(bitmap)))
+            .offset(1)
+            .build()?;
+        // array is `[null, "Πx:S.T"]`
+        let array = GenericStringArray::<O>::from(data);
+        // result is `[null, "x:S.T"]`
+        let result = substring_by_char(&array, 1, None)?;
+        let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
+        assert_eq!(result, expected);
+
+        Ok(())
+    }
+
+    #[test]
+    fn string_with_non_zero_offset_by_char() -> Result<()> {
+        generic_string_by_char_with_non_zero_offset::<i32>()
+    }
+
+    #[test]
+    fn large_string_with_non_zero_offset_by_char() -> Result<()> {
+        generic_string_by_char_with_non_zero_offset::<i64>()
+    }
+
     #[test]
     fn dictionary() -> Result<()> {
         _dictionary::<Int8Type>()?;