You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/11/20 12:09:09 UTC

[arrow-rs] branch cherry_pick_ee1d1644 created (now 1859aa0)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch cherry_pick_ee1d1644
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git.


      at 1859aa0  add ilike comparitor (#874)

This branch includes the following new commits:

     new 1859aa0  add ilike comparitor (#874)

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[arrow-rs] 01/01: add ilike comparitor (#874)

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch cherry_pick_ee1d1644
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 1859aa0a381224e1c794b0ce8575b3eba293f472
Author: Jordan Deitch <jw...@users.noreply.github.com>
AuthorDate: Tue Nov 9 08:56:18 2021 -0500

    add ilike comparitor (#874)
    
    * add ilike comparitor
    
    * add ilike comparitor
    
    Co-authored-by: Jordan Deitch <jd...@digitalocean.com>
---
 arrow/benches/comparison_kernels.rs     |  25 +++
 arrow/src/compute/kernels/comparison.rs | 270 +++++++++++++++++++++++---------
 2 files changed, 217 insertions(+), 78 deletions(-)

diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index bfee9b9..94ff7df 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -119,6 +119,11 @@ fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
         .unwrap();
 }
 
+fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
+    ilike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b))
+        .unwrap();
+}
+
 fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
     regexp_is_match_utf8_scalar(
         criterion::black_box(arr_a),
@@ -205,6 +210,26 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xx_xx%xxx"))
     });
 
+    c.bench_function("ilike_utf8 scalar equals", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xxXX"))
+    });
+
+    c.bench_function("ilike_utf8 scalar contains", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xxXX%"))
+    });
+
+    c.bench_function("ilike_utf8 scalar ends with", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xXXx%"))
+    });
+
+    c.bench_function("ilike_utf8 scalar starts with", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%XXXx"))
+    });
+
+    c.bench_function("ilike_utf8 scalar complex", |b| {
+        b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
+    });
+
     c.bench_function("egexp_matches_utf8 scalar starts with", |b| {
         b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
     });
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index 9d49e89..3b65f33 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -228,28 +228,23 @@ where
     compare_op_scalar_primitive!(left, right, op)
 }
 
-/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
-///
-/// There are two wildcards supported with the LIKE operator:
-///
-/// 1. `%` - The percent sign represents zero, one, or multiple characters
-/// 2. `_` - The underscore represents a single character
-///
-/// For example:
-/// ```
-/// use arrow::array::{StringArray, BooleanArray};
-/// use arrow::compute::like_utf8;
-///
-/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
-/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]);
+fn is_like_pattern(c: char) -> bool {
+    c == '%' || c == '_'
+}
+
+/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`]
 ///
-/// let result = like_utf8(&strings, &patterns).unwrap();
-/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
-/// ```
-pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
+/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`)
+fn regex_like<OffsetSize, F>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
+    negate_regex: bool,
+    op: F,
+) -> Result<BooleanArray>
+where
+    OffsetSize: StringOffsetSizeTrait,
+    F: Fn(&str) -> Result<Regex>,
+{
     let mut map = HashMap::new();
     if left.len() != right.len() {
         return Err(ArrowError::ComputeError(
@@ -269,17 +264,16 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
             regex
         } else {
             let re_pattern = pat.replace("%", ".*").replace("_", ".");
-            let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
-                ArrowError::ComputeError(format!(
-                    "Unable to build regex from LIKE pattern: {}",
-                    e
-                ))
-            })?;
+            let re = op(&re_pattern)?;
             map.insert(pat, re);
             map.get(pat).unwrap()
         };
 
-        result.append(re.is_match(haystack));
+        result.append(if negate_regex {
+            !re.is_match(haystack)
+        } else {
+            re.is_match(haystack)
+        });
     }
 
     let data = unsafe {
@@ -296,8 +290,36 @@ pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
     Ok(BooleanArray::from(data))
 }
 
-fn is_like_pattern(c: char) -> bool {
-    c == '%' || c == '_'
+/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// There are two wildcards supported with the LIKE operator:
+///
+/// 1. `%` - The percent sign represents zero, one, or multiple characters
+/// 2. `_` - The underscore represents a single character
+///
+/// For example:
+/// ```
+/// use arrow::array::{StringArray, BooleanArray};
+/// use arrow::compute::like_utf8;
+///
+/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
+/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]);
+///
+/// let result = like_utf8(&strings, &patterns).unwrap();
+/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
+/// ```
+pub fn like_utf8<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    regex_like(left, right, false, |re_pattern| {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    })
 }
 
 /// Perform SQL `left LIKE right` operation on [`StringArray`] /
@@ -376,36 +398,55 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &GenericStringArray<OffsetSize>,
 ) -> Result<BooleanArray> {
-    let mut map = HashMap::new();
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform comparison operation on arrays of different length"
-                .to_string(),
-        ));
-    }
-
-    let null_bit_buffer =
-        combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
+    regex_like(left, right, true, |re_pattern| {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    })
+}
 
+/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray> {
+    let null_bit_buffer = left.data().null_buffer().cloned();
     let mut result = BooleanBufferBuilder::new(left.len());
-    for i in 0..left.len() {
-        let haystack = left.value(i);
-        let pat = right.value(i);
-        let re = if let Some(ref regex) = map.get(pat) {
-            regex
-        } else {
-            let re_pattern = pat.replace("%", ".*").replace("_", ".");
-            let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
-                ArrowError::ComputeError(format!(
-                    "Unable to build regex from LIKE pattern: {}",
-                    e
-                ))
-            })?;
-            map.insert(pat, re);
-            map.get(pat).unwrap()
-        };
 
-        result.append(!re.is_match(haystack));
+    if !right.contains(is_like_pattern) {
+        // fast path, can use equals
+        for i in 0..left.len() {
+            result.append(left.value(i) != right);
+        }
+    } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use ends_with
+        for i in 0..left.len() {
+            result.append(!left.value(i).starts_with(&right[..right.len() - 1]));
+        }
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, can use starts_with
+        for i in 0..left.len() {
+            result.append(!left.value(i).ends_with(&right[1..]));
+        }
+    } else {
+        let re_pattern = right.replace("%", ".*").replace("_", ".");
+        let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })?;
+        for i in 0..left.len() {
+            let haystack = left.value(i);
+            result.append(!re.is_match(haystack));
+        }
     }
 
     let data = unsafe {
@@ -422,11 +463,29 @@ pub fn nlike_utf8<OffsetSize: StringOffsetSizeTrait>(
     Ok(BooleanArray::from(data))
 }
 
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
+/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_utf8<OffsetSize: StringOffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    regex_like(left, right, false, |re_pattern| {
+        Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    })
+}
+
+/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
 /// [`LargeStringArray`] and a scalar.
 ///
 /// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
+pub fn ilike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
     left: &GenericStringArray<OffsetSize>,
     right: &str,
 ) -> Result<BooleanArray> {
@@ -436,30 +495,38 @@ pub fn nlike_utf8_scalar<OffsetSize: StringOffsetSizeTrait>(
     if !right.contains(is_like_pattern) {
         // fast path, can use equals
         for i in 0..left.len() {
-            result.append(left.value(i) != right);
+            result.append(left.value(i) == right);
         }
     } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
     {
         // fast path, can use ends_with
         for i in 0..left.len() {
-            result.append(!left.value(i).starts_with(&right[..right.len() - 1]));
+            result.append(
+                left.value(i)
+                    .to_uppercase()
+                    .starts_with(&right[..right.len() - 1].to_uppercase()),
+            );
         }
     } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
         // fast path, can use starts_with
         for i in 0..left.len() {
-            result.append(!left.value(i).ends_with(&right[1..]));
+            result.append(
+                left.value(i)
+                    .to_uppercase()
+                    .ends_with(&right[1..].to_uppercase()),
+            );
         }
     } else {
         let re_pattern = right.replace("%", ".*").replace("_", ".");
-        let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
             ArrowError::ComputeError(format!(
-                "Unable to build regex from LIKE pattern: {}",
+                "Unable to build regex from ILIKE pattern: {}",
                 e
             ))
         })?;
         for i in 0..left.len() {
             let haystack = left.value(i);
-            result.append(!re.is_match(haystack));
+            result.append(re.is_match(haystack));
         }
     }
 
@@ -2128,21 +2195,6 @@ mod tests {
     );
 
     test_utf8!(
-        test_utf8_array_nlike,
-        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
-        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
-        nlike_utf8,
-        vec![false, false, false, true, true, false, true]
-    );
-    test_utf8_scalar!(
-        test_utf8_array_nlike_scalar,
-        vec!["arrow", "parquet", "datafusion", "flight"],
-        "%ar%",
-        nlike_utf8_scalar,
-        vec![false, false, true, true]
-    );
-
-    test_utf8!(
         test_utf8_array_eq,
         vec!["arrow", "arrow", "arrow", "arrow"],
         vec!["arrow", "parquet", "datafusion", "flight"],
@@ -2157,6 +2209,21 @@ mod tests {
         vec![true, false, false, false]
     );
 
+    test_utf8!(
+        test_utf8_array_nlike,
+        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
+        nlike_utf8,
+        vec![false, false, false, true, true, false, true]
+    );
+    test_utf8_scalar!(
+        test_utf8_array_nlike_scalar,
+        vec!["arrow", "parquet", "datafusion", "flight"],
+        "%ar%",
+        nlike_utf8_scalar,
+        vec![false, false, true, true]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_nlike_scalar_start,
         vec!["arrow", "parrow", "arrows", "arr"],
@@ -2190,6 +2257,53 @@ mod tests {
     );
 
     test_utf8!(
+        test_utf8_array_ilike,
+        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+        ilike_utf8,
+        vec![true, true, true, false, false, true, false]
+    );
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar,
+        vec!["arrow", "parquet", "datafusion", "flight"],
+        "%AR%",
+        ilike_utf8_scalar,
+        vec![true, true, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_start,
+        vec!["arrow", "parrow", "arrows", "ARR"],
+        "aRRow%",
+        ilike_utf8_scalar,
+        vec![true, false, true, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_end,
+        vec!["ArroW", "parrow", "ARRowS", "arr"],
+        "%arrow",
+        ilike_utf8_scalar,
+        vec![true, true, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_equals,
+        vec!["arrow", "parrow", "arrows", "arr"],
+        "arrow",
+        ilike_utf8_scalar,
+        vec![true, false, false, false]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_ilike_scalar_one,
+        vec!["arrow", "arrows", "parrow", "arr"],
+        "arrow_",
+        ilike_utf8_scalar,
+        vec![false, true, false, false]
+    );
+
+    test_utf8!(
         test_utf8_array_neq,
         vec!["arrow", "arrow", "arrow", "arrow"],
         vec!["arrow", "parquet", "datafusion", "flight"],