You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/06/15 13:13:55 UTC

[arrow-rs] branch master updated: Add `nilike` support in `comparison` (#1846)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9860aa78b Add `nilike` support in `comparison` (#1846)
9860aa78b is described below

commit 9860aa78bedd0a3b523e432c7036cb3243ac12eb
Author: Alex Qyoun-ae <40...@users.noreply.github.com>
AuthorDate: Wed Jun 15 17:13:49 2022 +0400

    Add `nilike` support in `comparison` (#1846)
---
 arrow/benches/comparison_kernels.rs     |  25 ++++++
 arrow/src/compute/kernels/comparison.rs | 137 ++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index 4dced67ad..21d83e07e 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
         .unwrap();
 }
 
+fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
+    let (array, pattern) = (criterion::black_box(arr_a), criterion::black_box(value_b));
+    nilike_utf8_scalar(array, pattern).unwrap();
+}
+
 fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
     regexp_is_match_utf8_scalar(
         criterion::black_box(arr_a),
@@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
     });
 
+    c.bench_function("nilike_utf8 scalar equals", |b| {
+        b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xxXX"))
+    });
+
+    c.bench_function("nilike_utf8 scalar contains", |b| {
+        b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xxXX%"))
+    });
+
+    c.bench_function("nilike_utf8 scalar ends with", |b| {
+        b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%"))
+    });
+
+    c.bench_function("nilike_utf8 scalar starts with", |b| {
+        b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx"))
+    });
+
+    c.bench_function("nilike_utf8 scalar complex", |b| {
+        b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
+    });
+
     c.bench_function("egexp_matches_utf8 scalar starts with", |b| {
         b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
     });
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index acb9ac229..068b9dedf 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
     Ok(BooleanArray::from(data))
 }
 
+/// Evaluate the case-insensitive SQL `left NOT ILIKE right` operation on
+/// [`StringArray`] / [`LargeStringArray`], with patterns taken from `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    regex_like(left, right, true, |re_pattern| {
+        // anchor the pattern at both ends and make it case-insensitive
+        let anchored = format!("(?i)^{}$", re_pattern);
+        Regex::new(&anchored).map_err(|err| {
+            let msg = format!("Unable to build regex from ILIKE pattern: {}", err);
+            ArrowError::ComputeError(msg)
+        })
+    })
+}
+
+/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// Matching is case-insensitive on every path, including patterns without
+/// wildcards. Returns an error if a regex built from `right` fails to compile.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray> {
+    // the validity of `left` is reused as the validity of the result
+    let null_bit_buffer = left.data().null_buffer().cloned();
+    let mut result = BooleanBufferBuilder::new(left.len());
+
+    if !right.contains(is_like_pattern) {
+        // fast path, no wildcards: case-insensitive inequality
+        let right_uppercase = right.to_uppercase();
+        for i in 0..left.len() {
+            result.append(left.value(i).to_uppercase() != right_uppercase);
+        }
+    } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, `prefix%`: can use starts_with
+        let prefix = right[..right.len() - 1].to_uppercase();
+        for i in 0..left.len() {
+            result.append(!left.value(i).to_uppercase().starts_with(&prefix));
+        }
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, `%suffix`: can use ends_with
+        let suffix = right[1..].to_uppercase();
+        for i in 0..left.len() {
+            result.append(!left.value(i).to_uppercase().ends_with(&suffix));
+        }
+    } else {
+        // general case: escape regex syntax, then map `%` -> `.*` and `_` -> `.`
+        let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })?;
+        for i in 0..left.len() {
+            let haystack = left.value(i);
+            result.append(!re.is_match(haystack));
+        }
+    }
+
+    // NOTE(review): `new_unchecked` is used with offset 0 and exactly one result bit
+    // per row of `left`; confirm the reused null buffer is right for sliced arrays.
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::Boolean,
+            left.len(),
+            None,
+            null_bit_buffer,
+            0,
+            vec![result.finish()],
+            vec![],
+        )
+    };
+    Ok(BooleanArray::from(data))
+}
+
 /// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
 /// If `regex_array` element has an empty value, the corresponding result value is always true.
 ///
@@ -3984,6 +4067,60 @@ mod tests {
         vec![false, true, false, false]
     );
 
+    test_utf8!(
+        test_utf8_array_nilike,
+        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+        nilike_utf8,
+        vec![false, false, false, true, true, false, true] // `true` = value does NOT match
+    );
+    test_utf8_scalar!(
+        nilike_utf8_scalar_escape_testing,
+        vec!["varchar(255)", "int(255)", "varchar", "int"],
+        "%(%)%", // `(` and `)` must be matched literally, not as a regex group
+        nilike_utf8_scalar,
+        vec![false, false, true, true]
+    );
+    test_utf8_scalar!(
+        test_utf8_array_nilike_scalar,
+        vec!["arrow", "parquet", "datafusion", "flight"],
+        "%AR%", // wildcard on both ends; uppercase pattern against lowercase values
+        nilike_utf8_scalar,
+        vec![false, false, true, true]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_nilike_scalar_start,
+        vec!["arrow", "parrow", "arrows", "ARR"],
+        "aRRow%", // single trailing `%`: mixed-case prefix pattern
+        nilike_utf8_scalar,
+        vec![false, true, false, true]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_nilike_scalar_end,
+        vec!["ArroW", "parrow", "ARRowS", "arr"],
+        "%arrow", // single leading `%`: suffix pattern against mixed-case values
+        nilike_utf8_scalar,
+        vec![false, false, true, true]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_nilike_scalar_equals,
+        vec!["arrow", "parrow", "arrows", "arr"], // NOTE: all lowercase, case folding untested
+        "arrow", // no wildcards in the pattern
+        nilike_utf8_scalar,
+        vec![false, true, true, true]
+    );
+
+    test_utf8_scalar!(
+        test_utf8_array_nilike_scalar_one,
+        vec!["arrow", "arrows", "parrow", "arr"],
+        "arrow_", // `_` stands for exactly one character
+        nilike_utf8_scalar,
+        vec![true, false, true, true]
+    );
+
     test_utf8!(
         test_utf8_array_neq,
         vec!["arrow", "arrow", "arrow", "arrow"],