You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/12/09 22:28:51 UTC
[arrow-rs] branch master updated: Add ASCII fast path for ILIKE scalar (90% faster) (#3306)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9e39f96b1 Add ASCII fast path for ILIKE scalar (90% faster) (#3306)
9e39f96b1 is described below
commit 9e39f96b121d88b7427295bd326d14bb78d0fb39
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri Dec 9 22:28:46 2022 +0000
Add ASCII fast path for ILIKE scalar (90% faster) (#3306)
* Add ASCII fast path for ILIKE scalar
* Update starts_with and ends_with
* Avoid allocations
* Format
* Add more tests
* Add failing test
* Verify is_ascii
---
arrow-array/src/array/byte_array.rs | 8 ++
arrow-string/src/like.rs | 207 ++++++++++++++++++++++++++----------
2 files changed, 158 insertions(+), 57 deletions(-)
diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs
index f846499ee..eb528384e 100644
--- a/arrow-array/src/array/byte_array.rs
+++ b/arrow-array/src/array/byte_array.rs
@@ -70,6 +70,14 @@ impl<T: ByteArrayType> GenericByteArray<T> {
self.data.buffers()[1].as_slice()
}
+ /// Returns true if all data within this array is ASCII
+ pub fn is_ascii(&self) -> bool {
+ let offsets = self.value_offsets();
+ let start = offsets.first().unwrap();
+ let end = offsets.last().unwrap();
+ self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
+ }
+
/// Returns the offset values in the offsets buffer
#[inline]
pub fn value_offsets(&self) -> &[T::Offset] {
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 2e0356e73..e359a80cb 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -442,7 +442,10 @@ pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
-/// See the documentation on [`like_utf8`] for more details.
+/// Case insensitive version of [`like_utf8`]
+///
+/// Note: this only implements loose matching as defined by the Unicode standard. For example,
+/// the `ff` ligature is not equivalent to `FF` and `ß` is not equivalent to `SS`
pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
@@ -499,7 +502,7 @@ pub fn ilike_dyn(
/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`].
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
#[cfg(feature = "dyn_cmp_dict")]
fn ilike_dict<K: ArrowPrimitiveType>(
left: &DictionaryArray<K>,
@@ -540,60 +543,55 @@ fn ilike_dict<K: ArrowPrimitiveType>(
}
#[inline]
-fn ilike_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>(
- left: L,
+fn ilike_scalar_op<O: OffsetSizeTrait, F: Fn(bool) -> bool>(
+ left: &GenericStringArray<O>,
right: &str,
op: F,
) -> Result<BooleanArray, ArrowError> {
- if !right.contains(is_like_pattern) {
- // fast path, can use equals
- let right_uppercase = right.to_uppercase();
-
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.to_uppercase() == right_uppercase)
- }))
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let start_str = &right[..right.len() - 1].to_uppercase();
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.to_uppercase().starts_with(start_str))
- }))
- } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_str = &right[1..].to_uppercase();
+ // If not ASCII faster to use case insensitive regex than using to_uppercase
+ if right.is_ascii() && left.is_ascii() {
+ if !right.contains(is_like_pattern) {
+ return Ok(BooleanArray::from_unary(left, |item| {
+ op(item.eq_ignore_ascii_case(right))
+ }));
+ } else if right.ends_with('%')
+ && !right.ends_with("\\%")
+ && !right[..right.len() - 1].contains(is_like_pattern)
+ {
+ // fast path, can use starts_with
+ let start_str = &right[..right.len() - 1];
+ return Ok(BooleanArray::from_unary(left, |item| {
+ let end = item.len().min(start_str.len());
+ let result = item.is_char_boundary(end)
+ && start_str.eq_ignore_ascii_case(&item[..end]);
+ op(result)
+ }));
+ } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+ // fast path, can use ends_with
+ let ends_str = &right[1..];
+ return Ok(BooleanArray::from_unary(left, |item| {
+ let start = item.len().saturating_sub(ends_str.len());
+ let result = item.is_char_boundary(start)
+ && ends_str.eq_ignore_ascii_case(&item[start..]);
+ op(result)
+ }));
+ }
+ }
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.to_uppercase().ends_with(ends_str))
- }))
- } else if right.starts_with('%')
- && right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[1..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use contains
- let contains = &right[1..right.len() - 1].to_uppercase();
- Ok(BooleanArray::from_unary(left, |item| {
- op(item.to_uppercase().contains(contains))
- }))
- } else {
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })?;
+ let re_pattern = replace_like_wildcards(right)?;
+ let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+ ArrowError::ComputeError(format!(
+ "Unable to build regex from ILIKE pattern: {}",
+ e
+ ))
+ })?;
- Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item))))
- }
+ Ok(BooleanArray::from_unary(left, |item| op(re.is_match(item))))
}
#[inline]
-fn ilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
+fn ilike_scalar<O: OffsetSizeTrait>(
+ left: &GenericStringArray<O>,
right: &str,
) -> Result<BooleanArray, ArrowError> {
ilike_scalar_op(left, right, |x| x)
@@ -603,7 +601,7 @@ fn ilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
/// [`LargeStringArray`], or [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`] and a scalar.
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn ilike_utf8_scalar_dyn(
left: &dyn Array,
right: &str,
@@ -641,7 +639,7 @@ pub fn ilike_utf8_scalar_dyn(
/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`] and a scalar.
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &str,
@@ -652,7 +650,7 @@ pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`].
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
@@ -670,7 +668,7 @@ pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`].
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn nilike_dyn(
left: &dyn Array,
right: &dyn Array,
@@ -709,7 +707,7 @@ pub fn nilike_dyn(
/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`].
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
#[cfg(feature = "dyn_cmp_dict")]
fn nilike_dict<K: ArrowPrimitiveType>(
left: &DictionaryArray<K>,
@@ -750,8 +748,8 @@ fn nilike_dict<K: ArrowPrimitiveType>(
}
#[inline]
-fn nilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
+fn nilike_scalar<O: OffsetSizeTrait>(
+ left: &GenericStringArray<O>,
right: &str,
) -> Result<BooleanArray, ArrowError> {
ilike_scalar_op(left, right, |x| !x)
@@ -761,7 +759,7 @@ fn nilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
/// [`LargeStringArray`], or [`DictionaryArray`] with values
/// [`StringArray`]/[`LargeStringArray`] and a scalar.
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn nilike_utf8_scalar_dyn(
left: &dyn Array,
right: &str,
@@ -799,7 +797,7 @@ pub fn nilike_utf8_scalar_dyn(
/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
/// [`LargeStringArray`] and a scalar.
///
-/// See the documentation on [`like_utf8`] for more details.
+/// See the documentation on [`ilike_utf8`] for more details.
pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &str,
@@ -1272,6 +1270,101 @@ mod tests {
vec![true, false, false, false]
);
+ // We only implement loose matching
+ test_utf8_scalar!(
+ test_utf8_array_ilike_unicode,
+ test_utf8_array_ilike_unicode_dyn,
+ vec![
+ "FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS"
+ ],
+ "FFkoSS",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_unicode_starts,
+ test_utf8_array_ilike_unicode_start_dyn,
+ vec![
+ "FFkoßsdlkdf",
+ "FFkoSSsdlkdf",
+ "FFkosssdlkdf",
+ "FFkoS",
+ "FFkos",
+ "ffkoSS",
+ "ffkoß",
+ "FfkosSsdfd",
+ "FFKoSS",
+ ],
+ "FFkoSS%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_unicode_ends,
+ test_utf8_array_ilike_unicode_ends_dyn,
+ vec![
+ "sdlkdfFFkoß",
+ "sdlkdfFFkoSS",
+ "sdlkdfFFkoss",
+ "FFkoS",
+ "FFkos",
+ "ffkoSS",
+ "ffkoß",
+ "h😃klFfkosS",
+ "FFKoSS",
+ ],
+ "%FFkoSS",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_unicode_contains,
+ test_utf8_array_ilike_unicode_contains_dyn,
+ vec![
+ "sdlkdfFkoßsdfs",
+ "sdlkdfFkoSSdggs",
+ "sdlkdfFkosssdsd",
+ "FkoS",
+ "Fkos",
+ "ffkoSS",
+ "ffkoß",
+ "😃sadlksffkosSsh😃klF",
+ "😱slgffkosSsh😃klF",
+ "FFKoSS",
+ ],
+ "%FFkoSS%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_unicode_complex,
+ test_utf8_array_ilike_unicode_complex_dyn,
+ vec![
+ "sdlkdfFooßsdfs",
+ "sdlkdfFooSSdggs",
+ "sdlkdfFoosssdsd",
+ "FooS",
+ "Foos",
+ "ffooSS",
+ "ffooß",
+ "😃sadlksffofsSsh😃klF",
+ "😱slgffoesSsh😃klF",
+ "FFKoSS",
+ ],
+ "%FF__SS%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, true, false, false, false, false, true, true, true]
+ );
+
test_utf8_scalar!(
test_utf8_array_ilike_scalar_one,
test_utf8_array_ilike_scalar_dyn_one,