You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/06/06 10:37:48 UTC

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #1784: Add `Substring_by_char`

tustvold commented on code in PR #1784:
URL: https://github.com/apache/arrow-rs/pull/1784#discussion_r890006811


##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -29,7 +30,7 @@ use crate::{
 use std::cmp::Ordering;
 use std::sync::Arc;
 
-/// Returns an ArrayRef with substrings of all the elements in `array`.
+/// Returns an [ArrayRef] with substrings of all the elements in `array`.

Review Comment:
   ```suggestion
   /// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
   ```



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -53,9 +54,10 @@ use std::sync::Arc;
 /// ```
 ///
 /// # Error
-/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
-///   array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
+/// - The function errors when the passed array is not a [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray]
+///   or [DictionaryArray] with supported array type as its value type.

Review Comment:
   ```suggestion
   ///   or [`DictionaryArray`] with supported array type as its value type.
   ```



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -53,9 +54,10 @@ use std::sync::Arc;
 /// ```
 ///
 /// # Error
-/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
-///   array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
+/// - The function errors when the passed array is not a [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray]

Review Comment:
   ```suggestion
   /// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`]
   ```



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -1083,6 +1135,164 @@ mod tests {
         generic_string_with_non_zero_offset::<i64>()
     }
 
+    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")];
+        let cases = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], 0, None, vec![None, None, None]),
+            // identity
+            (
+                input_vals.clone(),
+                0,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // 0 length -> Nothing
+            (
+                input_vals.clone(),
+                0,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high start -> Nothing
+            (
+                input_vals.clone(),
+                1000,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high negative start -> identity
+            (
+                input_vals.clone(),
+                -1000,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // high length -> identity
+            (
+                input_vals.clone(),
+                0,
+                Some(1000),
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = GenericStringArray::<O>::from(array);
+                let result = substring_by_char(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+
+                let expected = GenericStringArray::<O>::from(expected);
+                assert_eq!(expected, result);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn with_nulls_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i32>()
+    }
+
+    #[test]
+    fn with_nulls_large_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i64>()
+    }
+
+    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec!["hello", "", "Γ ⊢x:T"];
+        let cases = vec![
+            // empty array is always identical
+            (vec!["", "", ""], 0, None, vec!["", "", ""]),
+            // increase start
+            (input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]),
+            (input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]),
+            (input_vals.clone(), 3, None, vec!["lo", "", "x:T"]),
+            (input_vals.clone(), 10, None, vec!["", "", ""]),
+            // increase start negatively
+            (input_vals.clone(), -1, None, vec!["o", "", "T"]),
+            (input_vals.clone(), -2, None, vec!["lo", "", ":T"]),
+            (input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]),
+            (input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]),
+            // increase length
+            (input_vals.clone(), 1, Some(1), vec!["e", "", " "]),
+            (input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]),
+            (input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]),
+            (input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]),
+            (input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]),
+            (input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]),
+            (input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(

Review Comment:
   Perhaps we could extract this as a function instead of duplicating it?



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -53,9 +54,10 @@ use std::sync::Arc;
 /// ```
 ///
 /// # Error
-/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
-///   array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
+/// - The function errors when the passed array is not a [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray]
+///   or [DictionaryArray] with supported array type as its value type.
 /// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
+/// It is recommended to use [substring_by_char] if the input array contains non-ASCII chars.

Review Comment:
   ```suggestion
   /// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars.
   ```



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -1083,6 +1135,164 @@ mod tests {
         generic_string_with_non_zero_offset::<i64>()
     }
 
+    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")];
+        let cases = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], 0, None, vec![None, None, None]),
+            // identity
+            (
+                input_vals.clone(),
+                0,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // 0 length -> Nothing
+            (
+                input_vals.clone(),
+                0,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high start -> Nothing
+            (
+                input_vals.clone(),
+                1000,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high negative start -> identity
+            (
+                input_vals.clone(),
+                -1000,
+                None,
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+            // high length -> identity
+            (
+                input_vals.clone(),
+                0,
+                Some(1000),
+                vec![Some("hello"), None, Some("Γ ⊢x:T")],
+            ),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = GenericStringArray::<O>::from(array);
+                let result = substring_by_char(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+
+                let expected = GenericStringArray::<O>::from(expected);
+                assert_eq!(expected, result);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn with_nulls_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i32>()
+    }
+
+    #[test]
+    fn with_nulls_large_string_by_char() -> Result<()> {
+        with_nulls_generic_string_by_char::<i64>()
+    }
+
+    fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
+        let input_vals = vec!["hello", "", "Γ ⊢x:T"];
+        let cases = vec![
+            // empty array is always identical
+            (vec!["", "", ""], 0, None, vec!["", "", ""]),
+            // increase start
+            (input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]),
+            (input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]),
+            (input_vals.clone(), 3, None, vec!["lo", "", "x:T"]),
+            (input_vals.clone(), 10, None, vec!["", "", ""]),
+            // increase start negatively
+            (input_vals.clone(), -1, None, vec!["o", "", "T"]),
+            (input_vals.clone(), -2, None, vec!["lo", "", ":T"]),
+            (input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]),
+            (input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]),
+            // increase length
+            (input_vals.clone(), 1, Some(1), vec!["e", "", " "]),
+            (input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]),
+            (input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]),
+            (input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]),
+            (input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]),
+            (input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]),
+            (input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]),
+            (input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = GenericStringArray::<O>::from(array);
+                let result = substring_by_char(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+                let expected = GenericStringArray::<O>::from(expected);
+                assert_eq!(expected, result);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn without_nulls_string_by_char() -> Result<()> {
+        without_nulls_generic_string_by_char::<i32>()
+    }
+
+    #[test]
+    fn without_nulls_large_string_by_char() -> Result<()> {
+        without_nulls_generic_string_by_char::<i64>()
+    }
+
+    fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() -> Result<()> {
+        let values = "S→T = Πx:S.T";

Review Comment:
   I think it would be easier to follow to just construct the regular array, and then call `array.slice(1, 2)`



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -1083,6 +1135,164 @@ mod tests {
         generic_string_with_non_zero_offset::<i64>()
     }
 
+    fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {

Review Comment:
   Loving the test coverage :+1:



##########
arrow/src/compute/kernels/substring.rs:
##########
@@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
     }
 }
 
+/// # Arguments
+/// * `array` - The input string array
+///
+/// * `start` - The start index of all substrings.
+/// If `start >= 0`, then count from the start of the string,
+/// otherwise count from the end of the string.
+///
+/// * `length`(option) - The length of all substrings.
+/// If `length` is `None`, then the substring is from `start` to the end of the string.
+///
+/// Attention: Both `start` and `length` are counted by char.
+///
+/// # Performance
+/// This function is slower than [substring].
+/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
+/// It is recommended to use [substring] if the input array only contains ASCII chars.
+///
+/// # Basic usage
+/// ```
+/// # use arrow::array::StringArray;
+/// # use arrow::compute::kernels::substring::substring_by_char;
+/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
+/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
+/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
+/// ```
+pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
+    array: &GenericStringArray<OffsetSize>,
+    start: i64,
+    length: Option<u64>,
+) -> Result<GenericStringArray<OffsetSize>> {
+    Ok(array
+        .iter()
+        .map(|val| {
+            val.map(|val| {
+                let char_count = val.chars().count();
+                let start = if start >= 0 {
+                    start.to_usize().unwrap().min(char_count)
+                } else {
+                    char_count - (-start).to_usize().unwrap().min(char_count)
+                };
+                let length = length.map_or(char_count - start, |length| {
+                    length.to_usize().unwrap().min(char_count - start)
+                });
+
+                val.chars().skip(start).take(length).collect::<String>()

Review Comment:
   Allocating a string temporary, only to copy out of it, is likely a significant portion of the slow-down. That combined with the null handling.
   
   This could definitely be handled as a separate PR, but you might want to consider doing something like (not tested).
   
   ```
   let nulls = // align bitmap to 0 offset, copying if already aligned
   let mut vals = BufferBuilder::<u8>::new(array.value_data().len());
   let mut indexes = BufferBuilder::<OffsetSize>::new(array.len() + 1);
   indexes.append(0);
   
   for val in array.iter() {
     let char_count = val.chars().count();
     let start = if start >= 0 {
         start.to_usize().unwrap().min(char_count)
     } else {
         char_count - (-start).to_usize().unwrap().min(char_count)
     };
     let length = length.map_or(char_count - start, |length| {
         length.to_usize().unwrap().min(char_count - start)
     });
   
     let mut start_byte = 0;
     let mut end_byte = val.len();
     for ((idx, (byte_idx, _)) in val.char_indices().enumerate() {
       if idx == start {
         start_byte = byte_idx;
       } else if idx == start + length {
         end_byte = byte_idx;
         break
       }
     }
     // Could even be unchecked
     vals.append_slice(&val[start_byte..end_byte]);
     indexes.append(vals.len() as _);
   }
   
   let data = ArrayDataBuilder::new(array.data_type()).len(array.len()).add_buffer(vals.finish())
     .add_buffer(indexes.finish()).add_buffer(vals.finish());
   
   Ok(GenericStringArray::<OffsetSize>::from(unsafe {data.build_unchecked()}))
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org