You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/12/09 20:09:59 UTC

[arrow-rs] branch master updated: Use take for dictionary like comparisons (#3313)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f078aede7 Use take for dictionary like comparisons (#3313)
f078aede7 is described below

commit f078aede7a82c5373ebc08ede16307b0268dce89
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Dec 9 20:09:54 2022 +0000

    Use take for dictionary like comparisons (#3313)
    
    * Use take for like comparisons
    
    * Fix benchmark name
    
    * Format
---
 arrow-string/Cargo.toml             |   1 +
 arrow-string/src/like.rs            | 121 +++++-------------------------------
 arrow/benches/comparison_kernels.rs |  24 ++++++-
 3 files changed, 39 insertions(+), 107 deletions(-)

diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
index fa32ab6dc..7dd4472f5 100644
--- a/arrow-string/Cargo.toml
+++ b/arrow-string/Cargo.toml
@@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" }
 arrow-data = { version = "29.0.0", path = "../arrow-data" }
 arrow-schema = { version = "29.0.0", path = "../arrow-schema" }
 arrow-array = { version = "29.0.0", path = "../arrow-array" }
+arrow-select = { version = "29.0.0", path = "../arrow-select" }
 regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
 regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }
 
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index c8a4d37cd..2e0356e73 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -21,6 +21,7 @@ use arrow_array::*;
 use arrow_data::bit_mask::combine_option_bitmap;
 use arrow_data::ArrayData;
 use arrow_schema::*;
+use arrow_select::take::take;
 use regex::Regex;
 use std::collections::HashMap;
 
@@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn(
         DataType::Dictionary(_, _) => {
             downcast_dictionary_array!(
                 left => {
-                    like_dict_scalar(left, right)
+                    let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?;
+                    // TODO: Use take_boolean (#2967)
+                    let array = take(&dict_comparison, left.keys(), None)?;
+                    Ok(BooleanArray::from(array.data().clone()))
                 }
                 t => Err(ArrowError::ComputeError(format!(
                     "Should be DictionaryArray but got: {}", t
@@ -240,31 +244,6 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
     like_scalar(left, right)
 }
 
-/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn like_dict_scalar<K: ArrowPrimitiveType>(
-    left: &DictionaryArray<K>,
-    right: &str,
-) -> Result<BooleanArray, ArrowError> {
-    match left.value_type() {
-        DataType::Utf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
-            like_scalar(left, right)
-        }
-        DataType::LargeUtf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
-            like_scalar(left, right)
-        }
-        _ => {
-            Err(ArrowError::ComputeError(
-                "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
-            ))
-        }
-    }
-}
-
 /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
 ///
 /// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
@@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn(
         DataType::Dictionary(_, _) => {
             downcast_dictionary_array!(
                 left => {
-                    nlike_dict_scalar(left, right)
+                    let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+                    // TODO: Use take_boolean (#2967)
+                    let array = take(&dict_comparison, left.keys(), None)?;
+                    Ok(BooleanArray::from(array.data().clone()))
                 }
                 t => Err(ArrowError::ComputeError(format!(
                     "Should be DictionaryArray but got: {}", t
@@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
     nlike_scalar(left, right)
 }
 
-/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nlike_dict_scalar<K: ArrowPrimitiveType>(
-    left: &DictionaryArray<K>,
-    right: &str,
-) -> Result<BooleanArray, ArrowError> {
-    match left.value_type() {
-        DataType::Utf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
-            nlike_scalar(left, right)
-        }
-        DataType::LargeUtf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
-            nlike_scalar(left, right)
-        }
-        _ => {
-            Err(ArrowError::ComputeError(
-                "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
-            ))
-        }
-    }
-}
-
 /// Perform SQL `left ILIKE right` operation on [`StringArray`] /
 /// [`LargeStringArray`].
 ///
@@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn(
         DataType::Dictionary(_, _) => {
             downcast_dictionary_array!(
                 left => {
-                    ilike_dict_scalar(left, right)
+                    let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+                    // TODO: Use take_boolean (#2967)
+                    let array = take(&dict_comparison, left.keys(), None)?;
+                    Ok(BooleanArray::from(array.data().clone()))
                 }
                 t => Err(ArrowError::ComputeError(format!(
                     "Should be DictionaryArray but got: {}", t
@@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
     ilike_scalar(left, right)
 }
 
-/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn ilike_dict_scalar<K: ArrowPrimitiveType>(
-    left: &DictionaryArray<K>,
-    right: &str,
-) -> Result<BooleanArray, ArrowError> {
-    match left.value_type() {
-        DataType::Utf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
-            ilike_scalar(left, right)
-        }
-        DataType::LargeUtf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
-            ilike_scalar(left, right)
-        }
-        _ => {
-            Err(ArrowError::ComputeError(
-                "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
-            ))
-        }
-    }
-}
-
 /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
 /// [`LargeStringArray`].
 ///
@@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn(
         DataType::Dictionary(_, _) => {
             downcast_dictionary_array!(
                 left => {
-                    nilike_dict_scalar(left, right)
+                    let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+                    // TODO: Use take_boolean (#2967)
+                    let array = take(&dict_comparison, left.keys(), None)?;
+                    Ok(BooleanArray::from(array.data().clone()))
                 }
                 t => Err(ArrowError::ComputeError(format!(
                     "Should be DictionaryArray but got: {}", t
@@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
     nilike_scalar(left, right)
 }
 
-/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nilike_dict_scalar<K: ArrowPrimitiveType>(
-    left: &DictionaryArray<K>,
-    right: &str,
-) -> Result<BooleanArray, ArrowError> {
-    match left.value_type() {
-        DataType::Utf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
-            nilike_scalar(left, right)
-        }
-        DataType::LargeUtf8 => {
-            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
-            nilike_scalar(left, right)
-        }
-        _ => {
-            Err(ArrowError::ComputeError(
-                "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
-            ))
-        }
-    }
-}
-
 fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }
diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index 99229ed0b..7b3b935bc 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
     });
 
-    let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
-    let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);
+    let strings = create_string_array::<i32>(20, 0.);
+    let dict_arr_a = create_dict_from_values::<Int32Type>(size, 0., &strings);
+    let dict_arr_b = create_dict_from_values::<Int32Type>(size, 0., &strings);
 
-    c.bench_function("dict eq string", |b| {
+    c.bench_function("eq dictionary[10] string[4])", |b| {
         b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
     });
+
+    c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| {
+        b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test"))
+    });
+
+    c.bench_function(
+        "gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])",
+        |b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")),
+    );
+
+    c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| {
+        b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test"))
+    });
+
+    c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| {
+        b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test"))
+    });
 }
 
 criterion_group!(benches, add_benchmark);