You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2023/01/12 19:50:52 UTC
[arrow-rs] branch master updated: Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9ae0c9bee Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)
9ae0c9bee is described below
commit 9ae0c9bee87da07063eee9849cbdb055bf227543
Author: Steve Vaughan <em...@stevevaughan.me>
AuthorDate: Thu Jan 12 14:50:45 2023 -0500
Add string comparisons (starts_with, ends_with, and contains) to kernel (#3502)
* Extract Regex implementation from dict function
Extract the implementation comparing 2 ArrayAccessors from the generated dict function so that it can be used for other string comparisons (i.e. starts_with, ends_with, and contains). The new functions replace the use of the macro parameters pat, neg, and typ.
* Provide SQL operation for documentation
Provide the entire SQL operation instead of generating it based on assumptions about the syntax of "like"-based operations. This will allow it to be used for other comparison operations.
* feat: Implement SQL STARTSWITH, ENDSWITH, and CONTAINS
* Add missing documentation for public functions
* Remove the dependency on arrow-ord
Duplicate compare_op and compare_op_scalar
* Fix document duplication source without a link
* fix: Helper functions shouldn't be public
* Duplication comment was in the wrong file
* Remove unused no_simd_compare_op
This was accidentally included as part of the duplication of compare_op and compare_op_scalar from arrow_ord::comparison
* Add unit tests
* fix: Remove typo in documentation
* fix: Be consistent with references to more details
Co-authored-by: Steve Vaughan Jr <s_...@apple.com>
---
arrow-string/src/like.rs | 320 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 287 insertions(+), 33 deletions(-)
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index d8afa8d4c..c9cdb7bab 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -25,9 +25,45 @@ use arrow_select::take::take;
use regex::Regex;
use std::collections::HashMap;
+/// Helper function to perform boolean lambda function on values from two array accessors, this
+/// version does not attempt to use SIMD.
+///
+/// Duplicated from `arrow_ord::comparison`
+fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>(
+ left: T,
+ right: S,
+ op: F,
+) -> Result<BooleanArray, ArrowError>
+where
+ F: Fn(T::Item, S::Item) -> bool,
+{
+ if left.len() != right.len() {
+ return Err(ArrowError::ComputeError(
+ "Cannot perform comparison operation on arrays of different length"
+ .to_string(),
+ ));
+ }
+
+ Ok(BooleanArray::from_binary(left, right, op))
+}
+
+/// Helper function to perform boolean lambda function on values from array accessor, this
+/// version does not attempt to use SIMD.
+///
+/// Duplicated from `arrow_ord::comparison`
+fn compare_op_scalar<T: ArrayAccessor, F>(
+ left: T,
+ op: F,
+) -> Result<BooleanArray, ArrowError>
+where
+ F: Fn(T::Item) -> bool,
+{
+ Ok(BooleanArray::from_unary(left, op))
+}
+
macro_rules! dyn_function {
($sql:tt, $fn_name:tt, $fn_utf8:tt, $fn_dict:tt) => {
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
/// [`LargeStringArray`], or [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`].
///
@@ -67,14 +103,32 @@ pub fn $fn_name(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, Arr
}
}
-dyn_function!("LIKE", like_dyn, like_utf8, like_dict);
-dyn_function!("NOT LIKE", nlike_dyn, nlike_utf8, nlike_dict);
-dyn_function!("ILIKE", ilike_dyn, ilike_utf8, ilike_dict);
-dyn_function!("NOT ILIKE", nilike_dyn, nilike_utf8, nilike_dict);
+dyn_function!("left LIKE right", like_dyn, like_utf8, like_dict);
+dyn_function!("left NOT LIKE right", nlike_dyn, nlike_utf8, nlike_dict);
+dyn_function!("left ILIKE right", ilike_dyn, ilike_utf8, ilike_dict);
+dyn_function!("left NOT ILIKE right", nilike_dyn, nilike_utf8, nilike_dict);
+dyn_function!(
+ "STARTSWITH(left, right)",
+ starts_with_dyn,
+ starts_with_utf8,
+ starts_with_dict
+);
+dyn_function!(
+ "ENDSWITH(left, right)",
+ ends_with_dyn,
+ ends_with_utf8,
+ ends_with_dict
+);
+dyn_function!(
+ "CONTAINS(left, right)",
+ contains_dyn,
+ contains_utf8,
+ contains_dict
+);
macro_rules! scalar_dyn_function {
($sql:tt, $fn_name:tt, $fn_scalar:tt) => {
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on [`StringArray`] /")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`StringArray`] /")]
/// [`LargeStringArray`], or [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`] and a scalar.
///
@@ -115,15 +169,34 @@ pub fn $fn_name(
}
}
}
-scalar_dyn_function!("LIKE", like_utf8_scalar_dyn, like_scalar);
-scalar_dyn_function!("NOT LIKE", nlike_utf8_scalar_dyn, nlike_scalar);
-scalar_dyn_function!("ILIKE", ilike_utf8_scalar_dyn, ilike_scalar);
-scalar_dyn_function!("NOT ILIKE", nilike_utf8_scalar_dyn, nilike_scalar);
+scalar_dyn_function!("left LIKE right", like_utf8_scalar_dyn, like_scalar);
+scalar_dyn_function!("left NOT LIKE right", nlike_utf8_scalar_dyn, nlike_scalar);
+scalar_dyn_function!("left ILIKE right", ilike_utf8_scalar_dyn, ilike_scalar);
+scalar_dyn_function!(
+ "left NOT ILIKE right",
+ nilike_utf8_scalar_dyn,
+ nilike_scalar
+);
+scalar_dyn_function!(
+ "STARTSWITH(left, right)",
+ starts_with_utf8_scalar_dyn,
+ starts_with_scalar
+);
+scalar_dyn_function!(
+ "ENDSWITH(left, right)",
+ ends_with_utf8_scalar_dyn,
+ ends_with_scalar
+);
+scalar_dyn_function!(
+ "CONTAINS(left, right)",
+ contains_utf8_scalar_dyn,
+ contains_scalar
+);
macro_rules! dict_function {
- ($sql:tt, $fn_name:tt, $pat:tt, $neg:expr, $typ:tt) => {
+ ($sql:tt, $fn_name:tt, $fn_impl:tt) => {
-#[doc = concat!("Perform SQL `left ", $sql ," right` operation on on [`DictionaryArray`] with values")]
+#[doc = concat!("Perform SQL `", $sql ,"` operation on [`DictionaryArray`] with values")]
/// [`StringArray`]/[`LargeStringArray`].
///
/// See the documentation on [`like_utf8`] for more details.
@@ -137,28 +210,13 @@ fn $fn_name<K: ArrowPrimitiveType>(
let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
- regex_like(left, right, $neg, |re_pattern| {
- Regex::new(&format!($pat, re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from {} pattern: {}",
- $typ, e
- ))
- })
- })
+ $fn_impl(left, right)
}
(DataType::LargeUtf8, DataType::LargeUtf8) => {
let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
- regex_like(left, right, $neg, |re_pattern| {
- Regex::new(&format!($pat, re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from {} pattern: {}",
- $typ,
- e
- ))
- })
- })
+ $fn_impl(left, right)
}
_ => Err(ArrowError::ComputeError(format!(
"{} only supports DictionaryArray with Utf8 or LargeUtf8 values",
@@ -169,10 +227,13 @@ fn $fn_name<K: ArrowPrimitiveType>(
}
}
-dict_function!("LIKE", like_dict, "^{}$", false, "LIKE");
-dict_function!("NOT LIKE", nlike_dict, "^{}$", true, "LIKE");
-dict_function!("ILIKE", ilike_dict, "(?i)^{}$", false, "ILIKE");
-dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE");
+dict_function!("left LIKE right", like_dict, like);
+dict_function!("left NOT LIKE right", nlike_dict, nlike);
+dict_function!("left ILIKE right", ilike_dict, ilike);
+dict_function!("left NOT ILIKE right", nilike_dict, nilike);
+dict_function!("STARTSWITH(left, right)", starts_with_dict, starts_with);
+dict_function!("ENDSWITH(left, right)", ends_with_dict, ends_with);
+dict_function!("CONTAINS(left, right)", contains_dict, contains);
/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
///
@@ -195,6 +256,14 @@ dict_function!("NOT ILIKE", nilike_dict, "(?i)^{}$", true, "ILIKE");
pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ like(left, right)
+}
+
+#[inline]
+fn like<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
) -> Result<BooleanArray, ArrowError> {
regex_like(left, right, false, |re_pattern| {
Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
@@ -318,6 +387,14 @@ fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> {
pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ nlike(left, right)
+}
+
+#[inline]
+fn nlike<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
) -> Result<BooleanArray, ArrowError> {
regex_like(left, right, true, |re_pattern| {
Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
@@ -358,6 +435,14 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ ilike(left, right)
+}
+
+#[inline]
+fn ilike<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
) -> Result<BooleanArray, ArrowError> {
regex_like(left, right, false, |re_pattern| {
Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
@@ -442,6 +527,14 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ nilike(left, right)
+}
+
+#[inline]
+fn nilike<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
) -> Result<BooleanArray, ArrowError> {
regex_like(left, right, true, |re_pattern| {
Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
@@ -533,6 +626,117 @@ where
Ok(BooleanArray::from(data))
}
+/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn starts_with_utf8<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ starts_with(left, right)
+}
+
+#[inline]
+fn starts_with<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op(left, right, |l, r| l.starts_with(r))
+}
+
+#[inline]
+fn starts_with_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+ left: L,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op_scalar(left, |item| item.starts_with(right))
+}
+
+/// Perform SQL `STARTSWITH(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn starts_with_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ starts_with_scalar(left, right)
+}
+
+/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ends_with_utf8<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ ends_with(left, right)
+}
+
+#[inline]
+fn ends_with<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op(left, right, |l, r| l.ends_with(r))
+}
+
+#[inline]
+fn ends_with_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+ left: L,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op_scalar(left, |item| item.ends_with(right))
+}
+
+/// Perform SQL `ENDSWITH(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ends_with_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ ends_with_scalar(left, right)
+}
+
+/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] / [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn contains_utf8<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+ contains(left, right)
+}
+
+#[inline]
+fn contains<'a, S: ArrayAccessor<Item = &'a str>>(
+ left: S,
+ right: S,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op(left, right, |l, r| l.contains(r))
+}
+
+#[inline]
+fn contains_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+ left: L,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ compare_op_scalar(left, |item| item.contains(right))
+}
+
+/// Perform SQL `CONTAINS(left, right)` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn contains_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ contains_scalar(left, right)
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -682,6 +886,18 @@ mod tests {
vec![true, false, true, false]
);
+ // Replicates `test_utf8_array_like_scalar_start` and `test_utf8_array_like_scalar_dyn_start` to
+ // demonstrate that `SQL STARTSWITH` works as expected.
+ test_utf8_scalar!(
+ test_utf8_array_starts_with_scalar_start,
+ test_utf8_array_starts_with_scalar_dyn_start,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow",
+ starts_with_utf8_scalar,
+ starts_with_utf8_scalar_dyn,
+ vec![true, false, true, false]
+ );
+
test_utf8_scalar!(
test_utf8_array_like_scalar_end,
test_utf8_array_like_scalar_dyn_end,
@@ -692,6 +908,18 @@ mod tests {
vec![true, true, false, false]
);
+ // Replicates `test_utf8_array_like_scalar_end` and `test_utf8_array_like_scalar_dyn_end` to
+ // demonstrate that `SQL ENDSWITH` works as expected.
+ test_utf8_scalar!(
+ test_utf8_array_ends_with_scalar_end,
+ test_utf8_array_ends_with_scalar_dyn_end,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow",
+ ends_with_utf8_scalar,
+ ends_with_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
test_utf8_scalar!(
test_utf8_array_like_scalar_equals,
test_utf8_array_like_scalar_dyn_equals,
@@ -1011,6 +1239,32 @@ mod tests {
vec![false, true, true, false, false, false, false, true, true, true]
);
+ // Replicates `test_utf8_array_ilike_unicode_contains` and
+ // `test_utf8_array_ilike_unicode_contains_dyn` to
+ // demonstrate that `SQL CONTAINS` works as expected.
+ //
+ // NOTE: 5 of the values were changed because the original used a case insensitive `ilike`.
+ test_utf8_scalar!(
+ test_utf8_array_contains_unicode_contains,
+ test_utf8_array_contains_unicode_contains_dyn,
+ vec![
+ "sdlkdfFkoßsdfs",
+ "sdlkdFFkoSSdggs", // Original was case insensitive "sdlkdfFkoSSdggs"
+ "sdlkdFFkoSSsdsd", // Original was case insensitive "sdlkdfFkosssdsd"
+ "FkoS",
+ "Fkos",
+ "ffkoSS",
+ "ffkoß",
+ "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF"
+ "😱slgFFkoSSsh😃klF", // Original was case insensitive "😱slgffkosSsh😃klF"
+ "FFkoSS", // "FFKoSS"
+ ],
+ "FFkoSS",
+ contains_utf8_scalar,
+ contains_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true, true, true]
+ );
+
test_utf8_scalar!(
test_utf8_array_ilike_unicode_complex,
test_utf8_array_ilike_unicode_complex_dyn,