You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/06/15 13:13:55 UTC
[arrow-rs] branch master updated: Add `nilike` support in `comparison` (#1846)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9860aa78b Add `nilike` support in `comparison` (#1846)
9860aa78b is described below
commit 9860aa78bedd0a3b523e432c7036cb3243ac12eb
Author: Alex Qyoun-ae <40...@users.noreply.github.com>
AuthorDate: Wed Jun 15 17:13:49 2022 +0400
Add `nilike` support in `comparison` (#1846)
---
arrow/benches/comparison_kernels.rs | 25 ++++++
arrow/src/compute/kernels/comparison.rs | 137 ++++++++++++++++++++++++++++++++
2 files changed, 162 insertions(+)
diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index 4dced67ad..21d83e07e 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -124,6 +124,11 @@ fn bench_ilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
.unwrap();
}
+/// Benchmark helper: evaluates `nilike_utf8_scalar` for `arr_a` against the
+/// pattern `value_b`, panicking if the kernel returns an error.
+fn bench_nilike_utf8_scalar(arr_a: &StringArray, value_b: &str) {
+    // Both inputs are passed through `black_box` before reaching the kernel.
+    let array = criterion::black_box(arr_a);
+    let pattern = criterion::black_box(value_b);
+    nilike_utf8_scalar(array, pattern).unwrap();
+}
+
fn bench_regexp_is_match_utf8_scalar(arr_a: &StringArray, value_b: &str) {
regexp_is_match_utf8_scalar(
criterion::black_box(arr_a),
@@ -254,6 +259,26 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xx_xX%xXX"))
});
+    // Register one `nilike_utf8_scalar` benchmark per pattern shape. Entries are
+    // (benchmark id, pattern) and are registered in the order listed.
+    // NOTE(review): the "ends with" id is paired with a trailing-`%` pattern and
+    // the "starts with" id with a leading-`%` pattern — confirm the labels are
+    // intended before relying on them when reading results.
+    for &(id, pattern) in [
+        ("nilike_utf8 scalar equals", "xxXX"),
+        ("nilike_utf8 scalar contains", "%xxXX%"),
+        ("nilike_utf8 scalar ends with", "xXXx%"),
+        ("nilike_utf8 scalar starts with", "%XXXx"),
+        ("nilike_utf8 scalar complex", "%xx_xX%xXX"),
+    ]
+    .iter()
+    {
+        c.bench_function(id, |b| {
+            b.iter(|| bench_nilike_utf8_scalar(&arr_string, pattern))
+        });
+    }
+
c.bench_function("egexp_matches_utf8 scalar starts with", |b| {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "^xx"))
});
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index acb9ac229..068b9dedf 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -548,6 +548,89 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
Ok(BooleanArray::from(data))
}
+/// Evaluate SQL `left NOT ILIKE right` element-wise on two [`StringArray`] /
+/// [`LargeStringArray`] inputs.
+///
+/// Each pattern is compiled as an anchored, case-insensitive regular expression
+/// (`(?i)^…$`). See the documentation on [`like_utf8`] for more details.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] when a pattern cannot be compiled into a
+/// regular expression.
+pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray> {
+    // NOTE(review): the `true` argument presumably asks `regex_like` to negate
+    // the match result — confirm against its definition.
+    regex_like(left, right, true, |re_pattern| {
+        let anchored = format!("(?i)^{}$", re_pattern);
+        match Regex::new(&anchored) {
+            Ok(compiled) => Ok(compiled),
+            Err(e) => Err(ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))),
+        }
+    })
+}
+
+/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// `right` is a single pattern applied to every element of `left`. `%` is
+/// translated to "any sequence of characters" and `_` to "any single
+/// character"; all comparisons are case-insensitive. Three common pattern
+/// shapes (no wildcard, `prefix%`, `%suffix`) are answered without building a
+/// regular expression.
+///
+/// The validity (null) buffer of the result is taken from `left`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] when `right` cannot be compiled into a
+/// regular expression.
+pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray> {
+    // NOTE(review): the null buffer is reused as-is while the result is built
+    // with offset 0 — confirm this is correct when `left` is a sliced array.
+    let null_bit_buffer = left.data().null_buffer().cloned();
+    let mut result = BooleanBufferBuilder::new(left.len());
+
+    if !right.contains(is_like_pattern) {
+        // fast path, no wildcard: case-insensitive inequality. Both sides are
+        // upper-cased so this agrees with the other (case-insensitive) branches.
+        let right_uppercase = right.to_uppercase();
+        for i in 0..left.len() {
+            result.append(left.value(i).to_uppercase() != right_uppercase);
+        }
+    } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, `prefix%`: can use starts_with.
+        // The pattern is upper-cased once instead of once per row.
+        let prefix_uppercase = right[..right.len() - 1].to_uppercase();
+        for i in 0..left.len() {
+            result.append(
+                !left
+                    .value(i)
+                    .to_uppercase()
+                    .starts_with(prefix_uppercase.as_str()),
+            );
+        }
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, `%suffix`: can use ends_with.
+        let suffix_uppercase = right[1..].to_uppercase();
+        for i in 0..left.len() {
+            result.append(
+                !left
+                    .value(i)
+                    .to_uppercase()
+                    .ends_with(suffix_uppercase.as_str()),
+            );
+        }
+    } else {
+        // general case: escape regex metacharacters, then translate the SQL
+        // wildcards and match case-insensitively against the whole value.
+        let re_pattern = escape(right).replace('%', ".*").replace('_', ".");
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })?;
+        for i in 0..left.len() {
+            let haystack = left.value(i);
+            result.append(!re.is_match(haystack));
+        }
+    }
+
+    // SAFETY: every branch above appends exactly one bit per element, so the
+    // value buffer holds `left.len()` bits, matching the declared length; the
+    // validity buffer comes from `left` (see the review note above regarding
+    // sliced inputs).
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::Boolean,
+            left.len(),
+            None,
+            null_bit_buffer,
+            0,
+            vec![result.finish()],
+            vec![],
+        )
+    };
+    Ok(BooleanArray::from(data))
+}
+
/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
/// If `regex_array` element has an empty value, the corresponding result value is always true.
///
@@ -3984,6 +4067,60 @@ mod tests {
vec![false, true, false, false]
);
+ // NOTE(review): `test_utf8!` / `test_utf8_scalar!` are defined outside this
+ // view; the arguments are assumed to be (test name, left values, right
+ // values or scalar pattern, kernel, expected result) — confirm.
+ //
+ // Array/array `NOT ILIKE`: expected entries are `true` where the pattern
+ // does NOT match, ignoring case.
+ test_utf8!(
+ test_utf8_array_nilike,
+ vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+ nilike_utf8,
+ vec![false, false, false, true, true, false, true]
+ );
+ // The pattern contains literal parentheses; the scalar kernel escapes the
+ // pattern before building its regex, so they must match literally.
+ test_utf8_scalar!(
+ nilike_utf8_scalar_escape_testing,
+ vec!["varchar(255)", "int(255)", "varchar", "int"],
+ "%(%)%",
+ nilike_utf8_scalar,
+ vec![false, false, true, true]
+ );
+ // `%AR%` has wildcards on both ends, so it takes the regex branch; values
+ // containing "ar" in any case match and therefore yield `false`.
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "%AR%",
+ nilike_utf8_scalar,
+ vec![false, false, true, true]
+ );
+
+ // Trailing `%` only: exercises the case-insensitive prefix fast path.
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_start,
+ vec!["arrow", "parrow", "arrows", "ARR"],
+ "aRRow%",
+ nilike_utf8_scalar,
+ vec![false, true, false, true]
+ );
+
+ // Leading `%` only: exercises the case-insensitive suffix fast path.
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_end,
+ vec!["ArroW", "parrow", "ARRowS", "arr"],
+ "%arrow",
+ nilike_utf8_scalar,
+ vec![false, false, true, true]
+ );
+
+ // No wildcard: exercises the equality fast path. All inputs and the pattern
+ // are lowercase, so case handling on that path is not exercised here.
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_equals,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow",
+ nilike_utf8_scalar,
+ vec![false, true, true, true]
+ );
+
+ // `_` stands for exactly one character, so only "arrows" matches `arrow_`.
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_one,
+ vec!["arrow", "arrows", "parrow", "arr"],
+ "arrow_",
+ nilike_utf8_scalar,
+ vec![true, false, true, true]
+ );
+
test_utf8!(
test_utf8_array_neq,
vec!["arrow", "arrow", "arrow", "arrow"],