You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/28 10:52:45 UTC

[arrow-rs] branch master updated: Add _dyn kernels of like, ilike, nlike, nilike kernels for dictionary support (#3197)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new a6daff5fc Add _dyn kernels of like, ilike, nlike, nilike kernels for dictionary support (#3197)
a6daff5fc is described below

commit a6daff5fcc360f9c570cc20cae26b53373af8d9b
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon Nov 28 02:52:39 2022 -0800

    Add _dyn kernels of like, ilike, nlike, nilike kernels for dictionary support (#3197)
    
    * Add dictionary suppport to like, ilike, nlike, nilike kernels
    
    * Add _dyn kernels for dictionary support
    
    * Gated by feature dyn_cmp_dict
---
 arrow/src/compute/kernels/comparison.rs | 385 +++++++++++++++++++++++++++++++-
 1 file changed, 380 insertions(+), 5 deletions(-)

diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index 10cab4889..33a24500a 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -140,14 +140,13 @@ fn is_like_pattern(c: char) -> bool {
 /// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`]
 ///
 /// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`)
-fn regex_like<OffsetSize, F>(
-    left: &GenericStringArray<OffsetSize>,
-    right: &GenericStringArray<OffsetSize>,
+fn regex_like<'a, S: ArrayAccessor<Item = &'a str>, F>(
+    left: S,
+    right: S,
     negate_regex: bool,
     op: F,
 ) -> Result<BooleanArray>
 where
-    OffsetSize: OffsetSizeTrait,
     F: Fn(&str) -> Result<Regex>,
 {
     let mut map = HashMap::new();
@@ -227,6 +226,86 @@ pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
     })
 }
 
+/// Perform SQL `left LIKE right` operation on [`StringArray`] /
+/// [`LargeStringArray`], or [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8)  => {
+            let left = as_string_array(left);
+            let right = as_string_array(right);
+            like_utf8(left, right)
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = as_largestring_array(left);
+            let right = as_largestring_array(right);
+            like_utf8(left, right)
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    let right = as_dictionary_array(right);
+                    like_dict(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => {
+            Err(ArrowError::ComputeError(
+                "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+            ))
+        }
+    }
+}
+
+/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn like_dict<K: ArrowNumericType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray> {
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
+
+            regex_like(left, right, false, |re_pattern| {
+                Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from LIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
+
+            regex_like(left, right, false, |re_pattern| {
+                Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from LIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        _ => Err(ArrowError::ComputeError(
+            "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
 #[inline]
 fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>(
     left: L,
@@ -402,6 +481,85 @@ pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
     })
 }
 
+/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8)  => {
+            let left = as_string_array(left);
+            let right = as_string_array(right);
+            nlike_utf8(left, right)
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = as_largestring_array(left);
+            let right = as_largestring_array(right);
+            nlike_utf8(left, right)
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    let right = as_dictionary_array(right);
+                    nlike_dict(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => {
+            Err(ArrowError::ComputeError(
+                "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+            ))
+        }
+    }
+}
+
+/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn nlike_dict<K: ArrowNumericType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray> {
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
+
+            regex_like(left, right, true, |re_pattern| {
+                Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from LIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
+
+            regex_like(left, right, true, |re_pattern| {
+                Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from LIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
 #[inline]
 fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
     left: L,
@@ -497,6 +655,85 @@ pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
     })
 }
 
+/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8)  => {
+            let left = as_string_array(left);
+            let right = as_string_array(right);
+            ilike_utf8(left, right)
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = as_largestring_array(left);
+            let right = as_largestring_array(right);
+            ilike_utf8(left, right)
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    let right = as_dictionary_array(right);
+                    ilike_dict(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => {
+            Err(ArrowError::ComputeError(
+                "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+            ))
+        }
+    }
+}
+
+/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn ilike_dict<K: ArrowNumericType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray> {
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
+
+            regex_like(left, right, false, |re_pattern| {
+                Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from ILIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
+
+            regex_like(left, right, false, |re_pattern| {
+                Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from ILIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        _ => Err(ArrowError::ComputeError(
+            "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
 #[inline]
 fn ilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
     left: L,
@@ -616,7 +853,7 @@ pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result<BooleanArr
         }
         _ => {
             Err(ArrowError::ComputeError(
-                "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+                "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
             ))
         }
     }
@@ -676,6 +913,85 @@ pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
     })
 }
 
+/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8)  => {
+            let left = as_string_array(left);
+            let right = as_string_array(right);
+            nilike_utf8(left, right)
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = as_largestring_array(left);
+            let right = as_largestring_array(right);
+            nilike_utf8(left, right)
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    let right = as_dictionary_array(right);
+                    nilike_dict(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => {
+            Err(ArrowError::ComputeError(
+                "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+            ))
+        }
+    }
+}
+
+/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
+/// [`StringArray`]/[`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn nilike_dict<K: ArrowNumericType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray> {
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
+
+            regex_like(left, right, true, |re_pattern| {
+                Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from ILIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
+            let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
+
+            regex_like(left, right, true, |re_pattern| {
+                Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+                    ArrowError::ComputeError(format!(
+                        "Unable to build regex from ILIKE pattern: {}",
+                        e
+                    ))
+                })
+            })
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
 #[inline]
 fn nilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
     left: L,
@@ -4451,6 +4767,24 @@ mod tests {
         };
     }
 
+    macro_rules! test_dict_utf8 {
+        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
+            #[test]
+            #[cfg(feature = "dyn_cmp_dict")]
+            fn $test_name() {
+                let left: DictionaryArray<Int8Type> = $left.into_iter().collect();
+                let right: DictionaryArray<Int8Type> = $right.into_iter().collect();
+                let res = $op(&left, &right).unwrap();
+                let expected = $expected;
+                assert_eq!(expected.len(), res.len());
+                for i in 0..res.len() {
+                    let v = res.value(i);
+                    assert_eq!(v, expected[i]);
+                }
+            }
+        };
+    }
+
     #[test]
     fn test_utf8_eq_scalar_on_slice() {
         let a = StringArray::from(
@@ -4599,6 +4933,14 @@ mod tests {
         vec![true, true, true, false, false, true, false, false]
     );
 
+    test_dict_utf8!(
+        test_utf8_array_like_dict,
+        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
+        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
+        like_dyn,
+        vec![true, true, true, false, false, true, false, false]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_like_scalar_escape_testing,
         test_utf8_array_like_scalar_dyn_escape_testing,
@@ -4707,6 +5049,14 @@ mod tests {
         vec![true]
     );
 
+    test_dict_utf8!(
+        test_utf8_scalar_ilike_regex_dict,
+        vec!["%%%"],
+        vec![r#"\%_\%"#],
+        ilike_dyn,
+        vec![true]
+    );
+
     #[test]
     fn test_replace_like_wildcards() {
         let a_eq = "_%";
@@ -4757,6 +5107,15 @@ mod tests {
         nlike_utf8,
         vec![false, false, false, true, true, false, true]
     );
+
+    test_dict_utf8!(
+        test_utf8_array_nlike_dict,
+        vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+        vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
+        nlike_dyn,
+        vec![false, false, false, true, true, false, true]
+    );
+
     test_utf8_scalar!(
         test_utf8_array_nlike_escape_testing,
         test_utf8_array_nlike_escape_dyn_testing_dyn,
@@ -4844,6 +5203,14 @@ mod tests {
         vec![true, true, true, false, false, true, false]
     );
 
+    test_dict_utf8!(
+        test_utf8_array_ilike_dict,
+        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+        ilike_dyn,
+        vec![true, true, true, false, false, true, false]
+    );
+
     test_utf8_scalar!(
         ilike_utf8_scalar_escape_testing,
         ilike_utf8_scalar_escape_dyn_testing,
@@ -4912,6 +5279,14 @@ mod tests {
         vec![false, false, false, true, true, false, true]
     );
 
+    test_dict_utf8!(
+        test_utf8_array_nilike_dict,
+        vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+        vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+        nilike_dyn,
+        vec![false, false, false, true, true, false, true]
+    );
+
     test_utf8_scalar!(
         nilike_utf8_scalar_escape_testing,
         nilike_utf8_scalar_escape_dyn_testing,