You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/11/22 20:55:11 UTC

[arrow-rs] branch active_release updated: add ilike comparitor (#874) (#961)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/active_release by this push:
     new 59f96e8  add ilike comparitor (#874) (#961)
59f96e8 is described below

commit 59f96e842d05b63882f7ba285c66a9739761cf84
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Nov 22 15:55:07 2021 -0500

    add ilike comparitor (#874) (#961)
    
    * add ilike comparitor
    
    * add ilike comparitor
    
    Co-authored-by: Jordan Deitch <jd...@digitalocean.com>
    
    Co-authored-by: Jordan Deitch <jw...@users.noreply.github.com>
    Co-authored-by: Jordan Deitch <jd...@digitalocean.com>
---
 arrow/benches/comparison_kernels.rs     |  25 +++
 arrow/src/compute/kernels/comparison.rs | 270 +++++++++++++++++++++++---------
 2 files changed, 217 insertions(+), 78 deletions(-)

diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index bfee9b9..94ff7df 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -119,6 +119,11 @@ fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
         .unwrap();
 }
 
+fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
+    ilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b))
+        .unwrap();
+}
+
 fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
     regexp_is_match_utf8_scalar(
         criterion::black_box(arr_a),
@@ -205,6 +210,26 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xx_xx%xxx"))
     });
 
+    c.bench_function("ilike_utf8 scalar equals", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xxXX"))
+    });
+
+    c.bench_function("ilike_utf8 scalar contains", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xxXX%"))
+    });
+
+    c.bench_function("ilike_utf8 scalar ends with", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xXXx%"))
+    });
+
+    c.bench_function("ilike_utf8 scalar starts with", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%XXXx"))
+    });
+
+    c.bench_function("ilike_utf8 scalar complex", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
+    });
+
     c.bench_function("egexp_matches_utf8 scalar starts with", |b| {
         b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
     });
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index 9d49e89..3b65f33 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -228,28 +228,23 @@ where
     compare_op_scalar_primitive!(left, right, op)
 }
 
-/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
-///
-/// There are two wildcards supported with the LIKE operator:
-///
-/// 1. `%` - The percent sign represents zero, one, or multiple characters
-/// 2. `_` - The underscore represents a single character
-///
-/// For example:
-/// ```
-/// use arrow::array::{StringArray, BooleanArray};
-/// use arrow::compute::like_utf8;
-///
-/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
-/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]);
+fn is_like_pattern(c: char) -> bool {
+    c == '%' || c == '_'
+}
+
+/// Match `left` against the regex that `op` builds from each LIKE pattern in `right` ([`StringArray`] / [`LargeStringArray`])
 ///
-/// let result = like_utf8(&strings, &patterns).unwrap();
-/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
-/// ```
-pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
+/// If `negate_regex` is true, each match result is inverted (as needed for `NOT LIKE`).
+fn regex_like<OffsetSize, F>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
+    negate_regex: bool,
+    op: F,
+) -> Result<BooleanArray>
+where
+    OffsetSize: StringOffsetSizeTrait,
+    F: Fn(&str) -> Result<Regex>,
+{
     let mut map = HashMap::new();
     if left.len() != right.len() {
         return Err(ArrowError::ComputeError(
@@ -269,17 +264,16 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
             regex
         } else {
             let re_pattern = pat.replace("%", ".*").replace("_", ".");
-            let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
-                ArrowError::ComputeError(format!(
-                    "Unable to build regex from LIKE pattern: {}",
-                    e
-                ))
-            })?;
+            let re = op(&re_pattern)?;
             map.insert(pat, re);
             map.get(pat).unwrap()
         };
 
-        result.append(re.is_match(haystack));
+        result.append(if negate_regex {
+            !re.is_match(haystack)
+        } else {
+            re.is_match(haystack)
+        });
     }
 
     let data = unsafe {
@@ -296,8 +290,36 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
     Ok(BooleanArray::from(data))
 }
 
-fn is_like_pattern(c: char) -> bool {
-    c == '%' || c == '_'
+/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// There are two wildcards supported with the LIKE operator:
+///
+/// 1. `%` - The percent sign represents zero, one, or multiple characters
+/// 2. `_` - The underscore represents a single character
+///
+/// For example:
+/// ```
+/// use arrow::array::{StringArray, BooleanArray};
+/// use arrow::compute::like_utf8;
+///
+/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
+/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]);
+///
+/// let result = like_utf8(&strings, &patterns).unwrap();
+/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
+/// ```
+pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    regex_like(left, right, false, |re_pattern| {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    })
 }
 
 /// Perform SQL `left LIKE right` operation on [`StringArray`] /
@@ -376,36 +398,55 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
 ) -> Result<BooleanArray> {
-    let mut map = HashMap::new();
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform comparison operation on arrays of different length"
-                .to_string(),
-        ));
-    }
-
-    let null_bit_buffer =
-        combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
+    regex_like(left, right, true, |re_pattern| {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    })
+}
 
+/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray> {
+    let null_bit_buffer = left.data().null_buffer().cloned();
     let mut result = BooleanBufferBuilder::new(left.len());
-    for i in 0..left.len() {
-        let haystack = left.value(i);
-        let pat = right.value(i);
-        let re = if let Some(ref regex) = map.get(pat) {
-            regex
-        } else {
-            let re_pattern = pat.replace("%", ".*").replace("_", ".");
-            let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
-                ArrowError::ComputeError(format!(
-                    "Unable to build regex from LIKE pattern: {}",
-                    e
-                ))
-            })?;
-            map.insert(pat, re);
-            map.get(pat).unwrap()
-        };
 
-        result.append(!re.is_match(haystack));
+    if !right.contains(is_like_pattern) {
+        // fast path, can use equals
+        for i in 0..left.len() {
+            result.append(left.value(i) != right);
+        }
+    } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use starts_with
+        for i in 0..left.len() {
+            result.append(!left.value(i).starts_with(&right[..right.len() - 1]));
+        }
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, can use ends_with
+        for i in 0..left.len() {
+            result.append(!left.value(i).ends_with(&right[1..]));
+        }
+    } else {
+        let re_pattern = right.replace("%", ".*").replace("_", ".");
+        let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })?;
+        for i in 0..left.len() {
+            let haystack = left.value(i);
+            result.append(!re.is_match(haystack));
+        }
     }
 
     let data = unsafe {
@@ -422,11 +463,29 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>(
     Ok(BooleanArray::from(data))
 }
 
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
+/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_utf8<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    regex_like(left, right, false, |re_pattern| {
+        Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    })
+}
+
+/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
 /// [`LargeStringArray`] and a scalar.
 ///
 /// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
+pub fn ilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &str,
 ) -> Result<BooleanArray> {
@@ -436,30 +495,38 @@ pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
     if !right.contains(is_like_pattern) {
         // fast path, can use equals
         for i in 0..left.len() {
-            result.append(left.value(i) != right);
+            result.append(left.value(i) == right);
         }
     } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
     {
         // fast path, can use ends_with
         for i in 0..left.len() {
-            result.append(!left.value(i).starts_with(&right[..right.len() - 1]));
+            result.append(
+                left.value(i)
+                    .to_uppercase()
+                    .starts_with(&right[..right.len() - 1].to_uppercase()),
+            );
         }
     } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
         // fast path, can use starts_with
         for i in 0..left.len() {
-            result.append(!left.value(i).ends_with(&right[1..]));
+            result.append(
+                left.value(i)
+                    .to_uppercase()
+                    .ends_with(&right[1..].to_uppercase()),
+            );
         }
     } else {
         let re_pattern = right.replace("%", ".*").replace("_", ".");
-        let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
             ArrowError::ComputeError(format!(
-                "Unable to build regex from LIKE pattern: {}",
+                "Unable to build regex from ILIKE pattern: {}",
                 e
             ))
         })?;
         for i in 0..left.len() {
             let haystack = left.value(i);
-            result.append(!re.is_match(haystack));
+            result.append(re.is_match(haystack));
         }
     }
 
@@ -2128,21 +2195,6 @@ mod tests {
     );
 
     test_utf8!(
-        test_utf8_array_nlike,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
-        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
-        nlike_utf8,
-        vec![false, false, false, true, true, false, true]
-    );
-    test_utf8_scalar!(
-        test_utf8_array_nlike_scalar,
-        vec!["arrow", "parquet", "datafusion", "flight"],
-        "%ar%",
-        nlike_utf8_scalar,
-        vec![false, false, true, true]
-    );
-
-    test_utf8!(
         test_utf8_array_eq,
         vec!["arrow", "arrow", "arrow", "arrow"],
         vec!["arrow", "parquet", "datafusion", "flight"],
@@ -2157,6 +2209,21 @@ mod tests {
         vec![true, false, false, false]
     );
 
+    test_utf8!(
+        test_utf8_array_nlike,
+        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
+        nlike_utf8,
+        vec![false, false, false, true, true, false, true]
+    );
+    test_utf8_scalar!(
+        test_utf8_array_nlike_scalar,
+        vec!["arrow", "parquet", "datafusion", "flight"],
+        "%ar%",
+        nlike_utf8_scalar,
+        vec![false, false, true, true]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_start,
         vec!["arrow", "parrow", "arrows", "arr"],
@@ -2190,6 +2257,53 @@ mod tests {
     );
 
     test_utf8!(
+        test_utf8_array_ilike,
+        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+        ilike_utf8,
+        vec![true, true, true, false, false, true, false]
+    );
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar,
+        vec!["arrow", "parquet", "datafusion", "flight"],
+        "%AR%",
+        ilike_utf8_scalar,
+        vec![true, true, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_start,
+        vec!["arrow", "parrow", "arrows", "ARR"],
+        "aRRow%",
+        ilike_utf8_scalar,
+        vec![true, false, true, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_end,
+        vec!["ArroW", "parrow", "ARRowS", "arr"],
+        "%arrow",
+        ilike_utf8_scalar,
+        vec![true, true, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_equals,
+        vec!["arrow", "parrow", "arrows", "arr"],
+        "arrow",
+        ilike_utf8_scalar,
+        vec![true, false, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_one,
+        vec!["arrow", "arrows", "parrow", "arr"],
+        "arrow_",
+        ilike_utf8_scalar,
+        vec![false, true, false, false]
+    );
+
+    test_utf8!(
         test_utf8_array_neq,
         vec!["arrow", "arrow", "arrow", "arrow"],
         vec!["arrow", "parquet", "datafusion", "flight"],