You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/12/09 20:09:59 UTC
[arrow-rs] branch master updated: Use take for dictionary like comparisons (#3313)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f078aede7 Use take for dictionary like comparisons (#3313)
f078aede7 is described below
commit f078aede7a82c5373ebc08ede16307b0268dce89
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Dec 9 20:09:54 2022 +0000
Use take for dictionary like comparisons (#3313)
* Use take for like comparisons
* Fix benchmark name
* Format
---
arrow-string/Cargo.toml | 1 +
arrow-string/src/like.rs | 121 +++++-------------------------------
arrow/benches/comparison_kernels.rs | 24 ++++++-
3 files changed, 39 insertions(+), 107 deletions(-)
diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
index fa32ab6dc..7dd4472f5 100644
--- a/arrow-string/Cargo.toml
+++ b/arrow-string/Cargo.toml
@@ -42,6 +42,7 @@ arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" }
arrow-data = { version = "29.0.0", path = "../arrow-data" }
arrow-schema = { version = "29.0.0", path = "../arrow-schema" }
arrow-array = { version = "29.0.0", path = "../arrow-array" }
+arrow-select = { version = "29.0.0", path = "../arrow-select" }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index c8a4d37cd..2e0356e73 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -21,6 +21,7 @@ use arrow_array::*;
use arrow_data::bit_mask::combine_option_bitmap;
use arrow_data::ArrayData;
use arrow_schema::*;
+use arrow_select::take::take;
use regex::Regex;
use std::collections::HashMap;
@@ -214,7 +215,10 @@ pub fn like_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
- like_dict_scalar(left, right)
+ let dict_comparison = like_utf8_scalar_dyn(left.values().as_ref(), right)?;
+ // TODO: Use take_boolean (#2967)
+ let array = take(&dict_comparison, left.keys(), None)?;
+ Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
@@ -240,31 +244,6 @@ pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
like_scalar(left, right)
}
-/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn like_dict_scalar<K: ArrowPrimitiveType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- like_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- like_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
///
/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
@@ -431,7 +410,10 @@ pub fn nlike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
- nlike_dict_scalar(left, right)
+ let dict_comparison = nlike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+ // TODO: Use take_boolean (#2967)
+ let array = take(&dict_comparison, left.keys(), None)?;
+ Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
@@ -457,31 +439,6 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nlike_scalar(left, right)
}
-/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nlike_dict_scalar<K: ArrowPrimitiveType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- nlike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- nlike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
@@ -663,7 +620,10 @@ pub fn ilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
- ilike_dict_scalar(left, right)
+ let dict_comparison = ilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+ // TODO: Use take_boolean (#2967)
+ let array = take(&dict_comparison, left.keys(), None)?;
+ Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
@@ -689,31 +649,6 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
ilike_scalar(left, right)
}
-/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn ilike_dict_scalar<K: ArrowPrimitiveType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- ilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- ilike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
@@ -843,7 +778,10 @@ pub fn nilike_utf8_scalar_dyn(
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
left => {
- nilike_dict_scalar(left, right)
+ let dict_comparison = nilike_utf8_scalar_dyn(left.values().as_ref(), right)?;
+ // TODO: Use take_boolean (#2967)
+ let array = take(&dict_comparison, left.keys(), None)?;
+ Ok(BooleanArray::from(array.data().clone()))
}
t => Err(ArrowError::ComputeError(format!(
"Should be DictionaryArray but got: {}", t
@@ -869,31 +807,6 @@ pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
nilike_scalar(left, right)
}
-/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nilike_dict_scalar<K: ArrowPrimitiveType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray, ArrowError> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- nilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- nilike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
fn is_like_pattern(c: char) -> bool {
c == '%' || c == '_'
}
diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
index 99229ed0b..7b3b935bc 100644
--- a/arrow/benches/comparison_kernels.rs
+++ b/arrow/benches/comparison_kernels.rs
@@ -314,12 +314,30 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_regexp_is_match_utf8_scalar(&arr_string, "xx$"))
});
- let dict_arr_a = create_string_dict_array::<Int32Type>(size, 0.0, 4);
- let dict_arr_b = create_string_dict_array::<Int32Type>(size, 0.0, 4);
+ let strings = create_string_array::<i32>(20, 0.);
+ let dict_arr_a = create_dict_from_values::<Int32Type>(size, 0., &strings);
+ let dict_arr_b = create_dict_from_values::<Int32Type>(size, 0., &strings);
- c.bench_function("dict eq string", |b| {
+ c.bench_function("eq dictionary[10] string[4])", |b| {
b.iter(|| bench_dict_eq(&dict_arr_a, &dict_arr_b))
});
+
+ c.bench_function("eq_dyn_utf8_scalar dictionary[10] string[4])", |b| {
+ b.iter(|| eq_dyn_utf8_scalar(&dict_arr_a, "test"))
+ });
+
+ c.bench_function(
+ "gt_eq_dyn_utf8_scalar scalar dictionary[10] string[4])",
+ |b| b.iter(|| gt_eq_dyn_utf8_scalar(&dict_arr_a, "test")),
+ );
+
+ c.bench_function("like_utf8_scalar_dyn dictionary[10] string[4])", |b| {
+ b.iter(|| like_utf8_scalar_dyn(&dict_arr_a, "test"))
+ });
+
+ c.bench_function("ilike_utf8_scalar_dyn dictionary[10] string[4])", |b| {
+ b.iter(|| ilike_utf8_scalar_dyn(&dict_arr_a, "test"))
+ });
}
criterion_group!(benches, add_benchmark);