You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2023/01/12 19:50:52 UTC

[arrow-rs] branch master updated: Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9ae0c9bee Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)
9ae0c9bee is described below

commit 9ae0c9bee87da07063eee9849cbdb055bf227543
Author: Steve Vaughan <em...@stevevaughan.me>
AuthorDate: Thu Jan 12 14:50:45 2023 -0500

    Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)
    
    * Extract Regex implementation from dict function
    
    Extract the implementation comparing 2 ArrayAccessors from the generated dict function so that it can be used for other string comparisons (i.e. starts_with, ends_with, and contains).  The new functions replace the use of the macro parameters pat, neg, and typ.
    
    * Provide SQL operation for documentation
    
    Provide the entire SQL operation instead of generating it based on assumptions about the syntax of "like"-based operations.   This will allow it to be used for other comparison operations.
    
    * feat: Implement SQL STARTSWITH, ENDSWITH, and CONTAINS
    
    * Add missing documentation for public functions
    
    * Remove the dependency on arrow-ord
    
    Duplicate compare_op and compare_op_scalar
    
    * Fix document duplication source without a link
    
    * fix: Helper functions shouldn't be public
    
    * Duplication comment was in the wrong file
    
    * Remove unused no_simd_compare_op
    
    This was accidentally included as part of the duplication of compare_op and compare_op_scalar from arrow_ord::comparison
    
    * Add unit tests
    
    * fix: Remove typo in documentation
    
    * fix: Be consistent with references to more details
    
    Co-authored-by: Steve Vaughan Jr <s_...@apple.com>
---
 arrow-string/src/like.rs | 320 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 287 insertions(+), 33 deletions(-)

diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index d8afa8d4c..c9cdb7bab 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -25,9 +25,45 @@ use arrow_select::take::take;
 use regex::Regex;
 use std::collections::HashMap;
 
+/// Helper function to perform boolean lambda function on values from two array accessors, this
+/// version does not attempt to use SIMD.
+///
+/// Duplicated from `arrow_ord::comparison`
+fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>(
+    left: T,
+    right: S,
+    op: F,
+) -> Result<BooleanArray, ArrowError>
+where
+    F: Fn(T::Item, S::Item) -> bool,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    Ok(BooleanArray::from_binary(left, right, op))
+}
+
+/// Helper function to perform boolean lambda function on values from array accessor, this
+/// version does not attempt to use SIMD.
+///
+/// Duplicated from `arrow_ord::comparison`
+fn compare_op_scalar<T: ArrayAccessor, F>(
+    left: T,
+    op: F,
+) -> Result<BooleanArray, ArrowError>
+where
+    F: Fn(T::Item) -> bool,
+{
+    Ok(BooleanArray::from_unary(left, op))
+}
+
 macro_rules! dyn_function {
     ($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => {
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
 /// [`LargeStringArray`], or [`DictionaryArray`] with values
 /// [`StringArray`]/[`LargeStringArray`].
 ///
@@ -67,14 +103,32 @@ pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, Arr
 
     }
 }
-dyn_function!("LIKE", like_dyn, like_utf8, like_dict);
-dyn_function!("NOT LIKE", nlike_dyn, nlike_utf8, nlike_dict);
-dyn_function!("ILIKE", ilike_dyn, ilike_utf8, ilike_dict);
-dyn_function!("NOT ILIKE", nilike_dyn, nilike_utf8, nilike_dict);
+dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict);
+dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict);
+dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict);
+dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict);
+dyn_function!(
+    "STARTSWITH(left, right)",
+    starts_with_dyn,
+    starts_with_utf8,
+    starts_with_dict
+);
+dyn_function!(
+    "ENDSWITH(left, right)",
+    ends_with_dyn,
+    ends_with_utf8,
+    ends_with_dict
+);
+dyn_function!(
+    "CONTAINS(left, right)",
+    contains_dyn,
+    contains_utf8,
+    contains_dict
+);
 
 macro_rules! scalar_dyn_function {
     ($sql:tt, $fn_name:tt, $fn_scalar:tt) => {
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
 /// [`LargeStringArray`], or [`DictionaryArray`] with values
 /// [`StringArray`]/[`LargeStringArray`] and a scalar.
 ///
@@ -115,15 +169,34 @@ pub fn $fn_name(
 }
     }
 }
-scalar_dyn_function!("LIKE", like_utf8_scalar_dyn, like_scalar);
-scalar_dyn_function!("NOT LIKE", nlike_utf8_scalar_dyn, nlike_scalar);
-scalar_dyn_function!("ILIKE", ilike_utf8_scalar_dyn, ilike_scalar);
-scalar_dyn_function!("NOT ILIKE", nilike_utf8_scalar_dyn, nilike_scalar);
+scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar);
+scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar);
+scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar);
+scalar_dyn_function!(
+    "left NOT ILIKE right",
+    nilike_utf8_scalar_dyn,
+    nilike_scalar
+);
+scalar_dyn_function!(
+    "STARTSWITH(left, right)",
+    starts_with_utf8_scalar_dyn,
+    starts_with_scalar
+);
+scalar_dyn_function!(
+    "ENDSWITH(left, right)",
+    ends_with_utf8_scalar_dyn,
+    ends_with_scalar
+);
+scalar_dyn_function!(
+    "CONTAINS(left, right)",
+    contains_utf8_scalar_dyn,
+    contains_scalar
+);
 
 macro_rules! dict_function {
-    ($sql:tt, $fn_name:tt, $pat:tt, $neg:expr, $typ:tt) => {
+    ($sql:tt, $fn_name:tt, $fn_impl:tt) => {
 
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on on [`DictionaryArray`] with values")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")]
 /// [`StringArray`]/[`LargeStringArray`].
 ///
 /// See the documentation on [`like_utf8`] for more details.
@@ -137,28 +210,13 @@ fn $fn_name<K: ArrowPrimitiveType>(
             let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
             let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
 
-            regex_like(left, right, $neg, |re_pattern| {
-                Regex::new(&format!($pat, re_pattern)).map_err(|e| {
-                    ArrowError::ComputeError(format!(
-                        "Unable to build regex from {} pattern: {}",
-                        $typ, e
-                    ))
-                })
-            })
+            $fn_impl(left, right)
         }
         (DataType::LargeUtf8, DataType::LargeUtf8) => {
             let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
             let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
 
-            regex_like(left, right, $neg, |re_pattern| {
-                Regex::new(&format!($pat, re_pattern)).map_err(|e| {
-                    ArrowError::ComputeError(format!(
-                        "Unable to build regex from {} pattern: {}",
-                        $typ,
-                        e
-                    ))
-                })
-            })
+            $fn_impl(left, right)
         }
         _ => Err(ArrowError::ComputeError(format!(
             "{} only supports DictionaryArray with Utf8 or LargeUtf8 values",
@@ -169,10 +227,13 @@ fn $fn_name<K: ArrowPrimitiveType>(
     }
 }
 
-dict_function!("LIKE", like_dict, "^{}$", false, "LIKE");
-dict_function!("NOT LIKE", nlike_dict, "^{}$", true, "LIKE");
-dict_function!("ILIKE", ilike_dict, "(?i)^{}$", false, "ILIKE");
-dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE");
+dict_function!("left LIKE right", like_dict, like);
+dict_function!("left NOT LIKE right", nlike_dict, nlike);
+dict_function!("left ILIKE right", ilike_dict, ilike);
+dict_function!("left NOT ILIKE right", nilike_dict, nilike);
+dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with);
+dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with);
+dict_function!("CONTAINS(left, right)", contains_dict, contains);
 
 /// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
 ///
@@ -195,6 +256,14 @@ dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE");
 pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    like(left, right)
+}
+
+#[inline]
+fn like<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
 ) -> Result<BooleanArray, ArrowError> {
     regex_like(left, right, false, |re_pattern| {
         Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
@@ -318,6 +387,14 @@ fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> {
 pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    nlike(left, right)
+}
+
+#[inline]
+fn nlike<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
 ) -> Result<BooleanArray, ArrowError> {
     regex_like(left, right, true, |re_pattern| {
         Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
@@ -358,6 +435,14 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
 pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    ilike(left, right)
+}
+
+#[inline]
+fn ilike<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
 ) -> Result<BooleanArray, ArrowError> {
     regex_like(left, right, false, |re_pattern| {
         Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
@@ -442,6 +527,14 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
 pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    nilike(left, right)
+}
+
+#[inline]
+fn nilike<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
 ) -> Result<BooleanArray, ArrowError> {
     regex_like(left, right, true, |re_pattern| {
         Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
@@ -533,6 +626,117 @@ where
     Ok(BooleanArray::from(data))
 }
 
+/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn starts_with_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    starts_with(left, right)
+}
+
+#[inline]
+fn starts_with<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op(left, right, |l, r| l.starts_with(r))
+}
+
+#[inline]
+fn starts_with_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op_scalar(left, |item| item.starts_with(right))
+}
+
+/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn starts_with_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    starts_with_scalar(left, right)
+}
+
+/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ends_with_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    ends_with(left, right)
+}
+
+#[inline]
+fn ends_with<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op(left, right, |l, r| l.ends_with(r))
+}
+
+#[inline]
+fn ends_with_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op_scalar(left, |item| item.ends_with(right))
+}
+
+/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ends_with_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    ends_with_scalar(left, right)
+}
+
+/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn contains_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    contains(left, right)
+}
+
+#[inline]
+fn contains<'a, S: ArrayAccessor<Item = &'a str>>(
+    left: S,
+    right: S,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op(left, right, |l, r| l.contains(r))
+}
+
+#[inline]
+fn contains_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    compare_op_scalar(left, |item| item.contains(right))
+}
+
+/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn contains_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    contains_scalar(left, right)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -682,6 +886,18 @@ mod tests {
         vec![true, false, true, false]
     );
 
+    // Replicates `test_utf8_array_like_scalar_start` and `test_utf8_array_like_scalar_dyn_start`
+    // to demonstrate that `SQL STARTSWITH` works as expected.
+    test_utf8_scalar!(
+        test_utf8_array_starts_with_scalar_start,
+        test_utf8_array_starts_with_scalar_dyn_start,
+        vec!["arrow", "parrow", "arrows", "arr"],
+        "arrow",
+        starts_with_utf8_scalar,
+        starts_with_utf8_scalar_dyn,
+        vec![true, false, true, false]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_like_scalar_end,
         test_utf8_array_like_scalar_dyn_end,
@@ -692,6 +908,18 @@ mod tests {
         vec![true, true, false, false]
     );
 
+    // Replicates `test_utf8_array_like_scalar_end` and `test_utf8_array_like_scalar_dyn_end` to
+    // demonstrate that `SQL ENDSWITH` works as expected.
+    test_utf8_scalar!(
+        test_utf8_array_ends_with_scalar_end,
+        test_utf8_array_ends_with_scalar_dyn_end,
+        vec!["arrow", "parrow", "arrows", "arr"],
+        "arrow",
+        ends_with_utf8_scalar,
+        ends_with_utf8_scalar_dyn,
+        vec![true, true, false, false]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_like_scalar_equals,
         test_utf8_array_like_scalar_dyn_equals,
@@ -1011,6 +1239,32 @@ mod tests {
         vec![false, true, true, false, false, false, false, true, true, true]
     );
 
+    // Replicates `test_utf8_array_ilike_unicode_contains` and
+    // `test_utf8_array_ilike_unicode_contains_dyn` to
+    // demonstrate that `SQL CONTAINS` works as expected.
+    //
+    // NOTE: 5 of the values were changed because the original used a case insensitive `ilike`.
+    test_utf8_scalar!(
+        test_utf8_array_contains_unicode_contains,
+        test_utf8_array_contains_unicode_contains_dyn,
+        vec![
+            "sdlkdfFkoßsdfs",
+            "sdlkdFFkoSSdggs", // Original was case insensitive "sdlkdfFkoSSdggs"
+            "sdlkdFFkoSSsdsd", // Original was case insensitive "sdlkdfFkosssdsd"
+            "FkoS",
+            "Fkos",
+            "ffkoSS",
+            "ffkoß",
+            "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF"
+            "😱slgFFkoSSsh😃klF",    // Original was case insensitive "😱slgffkosSsh😃klF"
+            "FFkoSS",                    // "FFKoSS"
+        ],
+        "FFkoSS",
+        contains_utf8_scalar,
+        contains_utf8_scalar_dyn,
+        vec![false, true, true, false, false, false, false, true, true, true]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_ilike_unicode_complex,
         test_utf8_array_ilike_unicode_complex_dyn,