You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/12/08 15:07:41 UTC
[arrow-rs] branch master updated: Split out arrow-string (#2594) (#3295)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 96c7c9d06 Split out arrow-string (#2594) (#3295)
96c7c9d06 is described below
commit 96c7c9d06628ef1690035bc0a1096901adaf084c
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Dec 8 15:07:35 2022 +0000
Split out arrow-string (#2594) (#3295)
* Split out arrow-string (#2594)
* Doc
* Clippy
---
.github/workflows/arrow.yml | 4 +
.github/workflows/arrow_flight.yml | 5 +-
.github/workflows/dev_pr/labeler.yml | 9 +-
.github/workflows/integration.yml | 15 +-
.github/workflows/miri.yaml | 11 +-
.github/workflows/parquet.yml | 3 +-
Cargo.toml | 35 +-
arrow-string/Cargo.toml | 49 +
.../src}/concat_elements.rs | 11 +-
.../compute/kernels => arrow-string/src}/length.rs | 184 +-
.../kernels/mod.rs => arrow-string/src/lib.rs | 17 +-
arrow-string/src/like.rs | 2100 ++++++++++++++++++++
.../compute/kernels => arrow-string/src}/regexp.rs | 152 +-
.../kernels => arrow-string/src}/substring.rs | 73 +-
arrow/Cargo.toml | 4 +-
arrow/src/compute/kernels/comparison.rs | 2041 +------------------
arrow/src/compute/kernels/mod.rs | 5 +-
arrow/src/lib.rs | 1 +
dev/release/README.md | 1 +
19 files changed, 2541 insertions(+), 2179 deletions(-)
diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index 2e1c64ebe..0b47f0256 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -70,6 +70,8 @@ jobs:
run: cargo test -p arrow-csv --all-features
- name: Test arrow-json with all features
run: cargo test -p arrow-json --all-features
+ - name: Test arrow-string with all features
+ run: cargo test -p arrow-string --all-features
- name: Test arrow-integration-test with all features
run: cargo test -p arrow-integration-test --all-features
- name: Test arrow with default features
@@ -184,5 +186,7 @@ jobs:
run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings
- name: Clippy arrow-json with all features
run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings
+ - name: Clippy arrow-string with all features
+ run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings
- name: Clippy arrow
run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings
diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml
index ab7030b05..356c0fc0a 100644
--- a/.github/workflows/arrow_flight.yml
+++ b/.github/workflows/arrow_flight.yml
@@ -31,10 +31,11 @@ on:
- arrow-buffer/**
- arrow-cast/**
- arrow-data/**
- - arrow-schema/**
- - arrow-select/**
- arrow-flight/**
- arrow-ipc/**
+ - arrow-schema/**
+ - arrow-select/**
+ - arrow-string/**
- .github/**
jobs:
diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml
index d93932cd2..35f2a873c 100644
--- a/.github/workflows/dev_pr/labeler.yml
+++ b/.github/workflows/dev_pr/labeler.yml
@@ -16,16 +16,17 @@
# under the License.
arrow:
- - arrow/**/*
- arrow-array/**/*
- arrow-buffer/**/*
- arrow-cast/**/*
+ - arrow-csv/**/*
- arrow-data/**/*
- - arrow-schema/**/*
- - arrow-select/**/*
- arrow-ipc/**/*
- - arrow-csv/**/*
- arrow-json/**/*
+ - arrow-schema/**/*
+ - arrow-select/**/*
+ - arrow-string/**/*
+ - arrow/**/*
arrow-flight:
- arrow-flight/**/*
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 656e56a65..d23f4c071 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -24,20 +24,21 @@ on:
- master
pull_request:
paths:
- - arrow/**
+ - .github/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
+ - arrow-csv/**
- arrow-data/**
- - arrow-schema/**
- - arrow-select/**
+ - arrow-integration-test/**
+ - arrow-integration-testing/**
- arrow-ipc/**
- - arrow-csv/**
- arrow-json/**
- arrow-pyarrow-integration-testing/**
- - arrow-integration-test/**
- - arrow-integration-testing/**
- - .github/**
+ - arrow-schema/**
+ - arrow-select/**
+ - arrow-string/**
+ - arrow/**
jobs:
diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml
index b1f5d85fc..f9cc7df79 100644
--- a/.github/workflows/miri.yaml
+++ b/.github/workflows/miri.yaml
@@ -24,17 +24,18 @@ on:
- master
pull_request:
paths:
- - arrow/**
+ - .github/**
- arrow-array/**
- arrow-buffer/**
- arrow-cast/**
+ - arrow-csv/**
- arrow-data/**
- - arrow-schema/**
- - arrow-select/**
- arrow-ipc/**
- - arrow-csv/**
- arrow-json/**
- - .github/**
+ - arrow-schema/**
+ - arrow-select/**
+ - arrow-string/**
+ - arrow/**
jobs:
miri-checks:
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index c5c7aac05..f7d94f857 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -36,6 +36,7 @@ on:
- arrow-ipc/**
- arrow-csv/**
- arrow-json/**
+ - arrow-string/**
- parquet/**
- .github/**
@@ -123,7 +124,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- rust: [stable]
+ rust: [ stable ]
steps:
- uses: actions/checkout@v3
- name: Setup Python
diff --git a/Cargo.toml b/Cargo.toml
index 16b4cb7f8..556b86a00 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,23 +17,24 @@
[workspace]
members = [
- "arrow",
- "arrow-array",
- "arrow-buffer",
- "arrow-cast",
- "arrow-csv",
- "arrow-data",
- "arrow-flight",
- "arrow-integration-test",
- "arrow-integration-testing",
- "arrow-ipc",
- "arrow-json",
- "arrow-schema",
- "arrow-select",
- "object_store",
- "parquet",
- "parquet_derive",
- "parquet_derive_test",
+ "arrow",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-cast",
+ "arrow-csv",
+ "arrow-data",
+ "arrow-flight",
+ "arrow-integration-test",
+ "arrow-integration-testing",
+ "arrow-ipc",
+ "arrow-json",
+ "arrow-schema",
+ "arrow-select",
+ "arrow-string",
+ "object_store",
+ "parquet",
+ "parquet_derive",
+ "parquet_derive_test",
]
# Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built
#
diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml
new file mode 100644
index 000000000..97c4b5ffb
--- /dev/null
+++ b/arrow-string/Cargo.toml
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "arrow-string"
+version = "28.0.0"
+description = "String kernels for arrow arrays"
+homepage = "https://github.com/apache/arrow-rs"
+repository = "https://github.com/apache/arrow-rs"
+authors = ["Apache Arrow <de...@arrow.apache.org>"]
+license = "Apache-2.0"
+keywords = ["arrow"]
+include = [
+ "benches/*.rs",
+ "src/**/*.rs",
+ "Cargo.toml",
+]
+edition = "2021"
+rust-version = "1.62"
+
+[lib]
+name = "arrow_string"
+path = "src/lib.rs"
+bench = false
+
+[dependencies]
+arrow-buffer = { version = "28.0.0", path = "../arrow-buffer" }
+arrow-data = { version = "28.0.0", path = "../arrow-data" }
+arrow-schema = { version = "28.0.0", path = "../arrow-schema" }
+arrow-array = { version = "28.0.0", path = "../arrow-array" }
+regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
+regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }
+
+[features]
+dyn_cmp_dict = []
diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow-string/src/concat_elements.rs
similarity index 97%
rename from arrow/src/compute/kernels/concat_elements.rs
rename to arrow-string/src/concat_elements.rs
index 25c8f60de..e9219fb2d 100644
--- a/arrow/src/compute/kernels/concat_elements.rs
+++ b/arrow-string/src/concat_elements.rs
@@ -15,9 +15,11 @@
// specific language governing permissions and limitations
// under the License.
-use crate::array::*;
-use crate::error::{ArrowError, Result};
+use arrow_array::builder::BufferBuilder;
+use arrow_array::*;
use arrow_data::bit_mask::combine_option_bitmap;
+use arrow_data::ArrayDataBuilder;
+use arrow_schema::ArrowError;
/// Returns the elementwise concatenation of a [`StringArray`].
///
@@ -36,7 +38,7 @@ use arrow_data::bit_mask::combine_option_bitmap;
pub fn concat_elements_utf8<Offset: OffsetSizeTrait>(
left: &GenericStringArray<Offset>,
right: &GenericStringArray<Offset>,
-) -> Result<GenericStringArray<Offset>> {
+) -> Result<GenericStringArray<Offset>, ArrowError> {
if left.len() != right.len() {
return Err(ArrowError::ComputeError(format!(
"Arrays must have the same length: {} != {}",
@@ -89,7 +91,7 @@ pub fn concat_elements_utf8<Offset: OffsetSizeTrait>(
/// An error will be returned if the [`StringArray`] are of different lengths
pub fn concat_elements_utf8_many<Offset: OffsetSizeTrait>(
arrays: &[&GenericStringArray<Offset>],
-) -> Result<GenericStringArray<Offset>> {
+) -> Result<GenericStringArray<Offset>, ArrowError> {
if arrays.is_empty() {
return Err(ArrowError::ComputeError(
"concat requires input of at least one array".to_string(),
@@ -158,6 +160,7 @@ pub fn concat_elements_utf8_many<Offset: OffsetSizeTrait>(
#[cfg(test)]
mod tests {
use super::*;
+ use arrow_array::StringArray;
#[test]
fn test_string_concat() {
let left = [Some("foo"), Some("bar"), None]
diff --git a/arrow/src/compute/kernels/length.rs b/arrow-string/src/length.rs
similarity index 84%
rename from arrow/src/compute/kernels/length.rs
rename to arrow-string/src/length.rs
index a68aa2bde..f7faa0a61 100644
--- a/arrow/src/compute/kernels/length.rs
+++ b/arrow-string/src/length.rs
@@ -17,12 +17,11 @@
//! Defines kernel for length of string arrays and binary arrays
-use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType};
-use crate::{
- datatypes::*,
- error::{ArrowError, Result},
-};
-
+use arrow_array::types::*;
+use arrow_array::*;
+use arrow_buffer::Buffer;
+use arrow_data::ArrayData;
+use arrow_schema::{ArrowError, DataType};
use std::sync::Arc;
macro_rules! unary_offsets {
@@ -153,7 +152,7 @@ where
/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray,
/// or DictionaryArray with above Arrays as values
/// * length of null is null.
-pub fn length(array: &dyn Array) -> Result<ArrayRef> {
+pub fn length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
match array.data_type() {
DataType::Dictionary(kt, _) => {
kernel_dict!(
@@ -189,7 +188,7 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef> {
/// or DictionaryArray with above Arrays as values
/// * bit_length of null is null.
/// * bit_length is in number of bits
-pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
+pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
match array.data_type() {
DataType::Dictionary(kt, _) => {
kernel_dict!(
@@ -220,6 +219,7 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
#[cfg(test)]
mod tests {
use super::*;
+ use arrow_array::cast::as_primitive_array;
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
@@ -245,11 +245,10 @@ mod tests {
macro_rules! length_binary_helper {
($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{
let array = GenericBinaryArray::<$offset_ty>::from($value);
- let result = $kernel(&array)?;
+ let result = $kernel(&array).unwrap();
let result = result.as_any().downcast_ref::<$result_ty>().unwrap();
let expected: $result_ty = $expected.into();
assert_eq!(expected.data(), result.data());
- Ok(())
}};
}
@@ -259,64 +258,61 @@ mod tests {
GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>(
$value,
);
- let result = length(&array)?;
+ let result = length(&array).unwrap();
let result = result.as_any().downcast_ref::<$result_ty>().unwrap();
let expected: $result_ty = $expected.into();
assert_eq!(expected.data(), result.data());
- Ok(())
}};
}
#[test]
#[cfg_attr(miri, ignore)] // running forever
- fn length_test_string() -> Result<()> {
+ fn length_test_string() {
length_cases_string()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = StringArray::from(input);
- let result = length(&array)?;
+ let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value, result.value(i));
});
- Ok(())
})
}
#[test]
#[cfg_attr(miri, ignore)] // running forever
- fn length_test_large_string() -> Result<()> {
+ fn length_test_large_string() {
length_cases_string()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
- let result = length(&array)?;
+ let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value as i64, result.value(i));
});
- Ok(())
})
}
#[test]
- fn length_test_binary() -> Result<()> {
+ fn length_test_binary() {
let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]];
let result: Vec<i32> = vec![4, 3, 2];
length_binary_helper!(i32, Int32Array, length, value, result)
}
#[test]
- fn length_test_large_binary() -> Result<()> {
+ fn length_test_large_binary() {
let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"];
let result: Vec<i64> = vec![4, 2, 3];
length_binary_helper!(i64, Int64Array, length, value, result)
}
#[test]
- fn length_test_list() -> Result<()> {
+ fn length_test_list() {
let value = vec![
Some(vec![]),
Some(vec![Some(1), Some(2), Some(4)]),
@@ -327,7 +323,7 @@ mod tests {
}
#[test]
- fn length_test_large_list() -> Result<()> {
+ fn length_test_large_list() {
let value = vec![
Some(vec![]),
Some(vec![Some(1.1), Some(2.2), Some(3.3)]),
@@ -348,28 +344,27 @@ mod tests {
}
#[test]
- fn length_null_string() -> Result<()> {
+ fn length_null_string() {
length_null_cases_string()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = StringArray::from(input);
- let result = length(&array)?;
+ let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
let expected: Int32Array = expected.into();
assert_eq!(expected.data(), result.data());
- Ok(())
})
}
#[test]
- fn length_null_large_string() -> Result<()> {
+ fn length_null_large_string() {
length_null_cases_string()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
- let result = length(&array)?;
+ let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
@@ -380,12 +375,11 @@ mod tests {
.collect::<Vec<_>>()
.into();
assert_eq!(expected.data(), result.data());
- Ok(())
})
}
#[test]
- fn length_null_binary() -> Result<()> {
+ fn length_null_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")];
let result: Vec<Option<i32>> = vec![Some(4), None, Some(2), Some(5)];
@@ -393,7 +387,7 @@ mod tests {
}
#[test]
- fn length_null_large_binary() -> Result<()> {
+ fn length_null_large_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")];
let result: Vec<Option<i64>> = vec![Some(2), None, Some(3), Some(5)];
@@ -401,7 +395,7 @@ mod tests {
}
#[test]
- fn length_null_list() -> Result<()> {
+ fn length_null_list() {
let value = vec![
Some(vec![]),
None,
@@ -413,7 +407,7 @@ mod tests {
}
#[test]
- fn length_null_large_list() -> Result<()> {
+ fn length_null_large_list() {
let value = vec![
Some(vec![]),
None,
@@ -434,31 +428,27 @@ mod tests {
/// Tests with an offset
#[test]
- fn length_offsets_string() -> Result<()> {
+ fn length_offsets_string() {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
- let result = length(b.as_ref())?;
+ let result = length(b.as_ref()).unwrap();
let result: &Int32Array = as_primitive_array(&result);
let expected = Int32Array::from(vec![Some(1), Some(5), None]);
assert_eq!(&expected, result);
-
- Ok(())
}
#[test]
- fn length_offsets_binary() -> Result<()> {
+ fn length_offsets_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None];
let a = BinaryArray::from(value);
let b = a.slice(1, 3);
- let result = length(b.as_ref())?;
+ let result = length(b.as_ref()).unwrap();
let result: &Int32Array = as_primitive_array(&result);
let expected = Int32Array::from(vec![Some(1), Some(2), None]);
assert_eq!(&expected, result);
-
- Ok(())
}
fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
@@ -480,47 +470,45 @@ mod tests {
#[test]
#[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI
- fn bit_length_test_string() -> Result<()> {
+ fn bit_length_test_string() {
bit_length_cases()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = StringArray::from(input);
- let result = bit_length(&array)?;
+ let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value, result.value(i));
});
- Ok(())
})
}
#[test]
#[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI
- fn bit_length_test_large_string() -> Result<()> {
+ fn bit_length_test_large_string() {
bit_length_cases()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
- let result = bit_length(&array)?;
+ let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value as i64, result.value(i));
});
- Ok(())
})
}
#[test]
- fn bit_length_binary() -> Result<()> {
+ fn bit_length_binary() {
let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];
let expected: Vec<i32> = vec![24, 16, 40];
length_binary_helper!(i32, Int32Array, bit_length, value, expected)
}
#[test]
- fn bit_length_large_binary() -> Result<()> {
+ fn bit_length_large_binary() {
let value: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]];
let expected: Vec<i64> = vec![32, 8, 16];
length_binary_helper!(i64, Int64Array, bit_length, value, expected)
@@ -535,28 +523,27 @@ mod tests {
}
#[test]
- fn bit_length_null_string() -> Result<()> {
+ fn bit_length_null_string() {
bit_length_null_cases()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = StringArray::from(input);
- let result = bit_length(&array)?;
+ let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
let expected: Int32Array = expected.into();
assert_eq!(expected.data(), result.data());
- Ok(())
})
}
#[test]
- fn bit_length_null_large_string() -> Result<()> {
+ fn bit_length_null_large_string() {
bit_length_null_cases()
.into_iter()
- .try_for_each(|(input, len, expected)| {
+ .for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
- let result = bit_length(&array)?;
+ let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
@@ -567,12 +554,11 @@ mod tests {
.collect::<Vec<_>>()
.into();
assert_eq!(expected.data(), result.data());
- Ok(())
})
}
#[test]
- fn bit_length_null_binary() -> Result<()> {
+ fn bit_length_null_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])];
let expected: Vec<Option<i32>> = vec![Some(24), None, Some(40), Some(16)];
@@ -580,7 +566,7 @@ mod tests {
}
#[test]
- fn bit_length_null_large_binary() -> Result<()> {
+ fn bit_length_null_large_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")];
let expected: Vec<Option<i64>> = vec![Some(24), None, Some(16), Some(32)];
@@ -597,47 +583,42 @@ mod tests {
/// Tests with an offset
#[test]
- fn bit_length_offsets_string() -> Result<()> {
+ fn bit_length_offsets_string() {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
- let result = bit_length(b.as_ref())?;
+ let result = bit_length(b.as_ref()).unwrap();
let result: &Int32Array = as_primitive_array(&result);
let expected = Int32Array::from(vec![Some(8), Some(40), None]);
assert_eq!(&expected, result);
-
- Ok(())
}
#[test]
- fn bit_length_offsets_binary() -> Result<()> {
+ fn bit_length_offsets_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"hello"), Some(&[]), Some(b"world"), None];
let a = BinaryArray::from(value);
let b = a.slice(1, 3);
- let result = bit_length(b.as_ref())?;
+ let result = bit_length(b.as_ref()).unwrap();
let result: &Int32Array = as_primitive_array(&result);
let expected = Int32Array::from(vec![Some(0), Some(40), None]);
assert_eq!(&expected, result);
-
- Ok(())
}
#[test]
- fn length_dictionary() -> Result<()> {
- _length_dictionary::<Int8Type>()?;
- _length_dictionary::<Int16Type>()?;
- _length_dictionary::<Int32Type>()?;
- _length_dictionary::<Int64Type>()?;
- _length_dictionary::<UInt8Type>()?;
- _length_dictionary::<UInt16Type>()?;
- _length_dictionary::<UInt32Type>()?;
- _length_dictionary::<UInt64Type>()?;
- Ok(())
- }
-
- fn _length_dictionary<K: ArrowDictionaryKeyType>() -> Result<()> {
+ fn length_dictionary() {
+ _length_dictionary::<Int8Type>();
+ _length_dictionary::<Int16Type>();
+ _length_dictionary::<Int32Type>();
+ _length_dictionary::<Int64Type>();
+ _length_dictionary::<UInt8Type>();
+ _length_dictionary::<UInt16Type>();
+ _length_dictionary::<UInt32Type>();
+ _length_dictionary::<UInt64Type>();
+ }
+
+ fn _length_dictionary<K: ArrowDictionaryKeyType>() {
const TOTAL: i32 = 100;
let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
@@ -657,7 +638,7 @@ mod tests {
let expected: Vec<Option<i32>> =
data.iter().map(|opt| opt.map(|s| s.len() as i32)).collect();
- let res = length(&dict_array)?;
+ let res = length(&dict_array).unwrap();
let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
let actual: Vec<Option<i32>> = actual
.values()
@@ -670,24 +651,21 @@ mod tests {
for i in 0..TOTAL as usize {
assert_eq!(expected[i], actual[i],);
}
-
- Ok(())
}
#[test]
- fn bit_length_dictionary() -> Result<()> {
- _bit_length_dictionary::<Int8Type>()?;
- _bit_length_dictionary::<Int16Type>()?;
- _bit_length_dictionary::<Int32Type>()?;
- _bit_length_dictionary::<Int64Type>()?;
- _bit_length_dictionary::<UInt8Type>()?;
- _bit_length_dictionary::<UInt16Type>()?;
- _bit_length_dictionary::<UInt32Type>()?;
- _bit_length_dictionary::<UInt64Type>()?;
- Ok(())
- }
-
- fn _bit_length_dictionary<K: ArrowDictionaryKeyType>() -> Result<()> {
+ fn bit_length_dictionary() {
+ _bit_length_dictionary::<Int8Type>();
+ _bit_length_dictionary::<Int16Type>();
+ _bit_length_dictionary::<Int32Type>();
+ _bit_length_dictionary::<Int64Type>();
+ _bit_length_dictionary::<UInt8Type>();
+ _bit_length_dictionary::<UInt16Type>();
+ _bit_length_dictionary::<UInt32Type>();
+ _bit_length_dictionary::<UInt64Type>();
+ }
+
+ fn _bit_length_dictionary<K: ArrowDictionaryKeyType>() {
const TOTAL: i32 = 100;
let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
@@ -709,7 +687,7 @@ mod tests {
.map(|opt| opt.map(|s| (s.chars().count() * 8) as i32))
.collect();
- let res = bit_length(&dict_array)?;
+ let res = bit_length(&dict_array).unwrap();
let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
let actual: Vec<Option<i32>> = actual
.values()
@@ -722,7 +700,5 @@ mod tests {
for i in 0..TOTAL as usize {
assert_eq!(expected[i], actual[i],);
}
-
- Ok(())
}
}
diff --git a/arrow/src/compute/kernels/mod.rs b/arrow-string/src/lib.rs
similarity index 71%
copy from arrow/src/compute/kernels/mod.rs
copy to arrow-string/src/lib.rs
index 0eebb7012..4bd4d2826 100644
--- a/arrow/src/compute/kernels/mod.rs
+++ b/arrow-string/src/lib.rs
@@ -15,23 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-//! Computation kernels on Arrow Arrays
+//! Arrow string kernels
-pub mod aggregate;
-pub mod arithmetic;
-pub mod arity;
-pub mod bitwise;
-pub mod boolean;
-pub mod comparison;
pub mod concat_elements;
pub mod length;
-pub mod limit;
-pub mod partition;
+pub mod like;
pub mod regexp;
-pub mod sort;
pub mod substring;
-pub mod temporal;
-
-pub use arrow_cast::cast;
-pub use arrow_cast::parse as cast_utils;
-pub use arrow_select::{concat, filter, interleave, take, window, zip};
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
new file mode 100644
index 000000000..11d79676d
--- /dev/null
+++ b/arrow-string/src/like.rs
@@ -0,0 +1,2100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::builder::BooleanBufferBuilder;
+use arrow_array::cast::*;
+use arrow_array::*;
+use arrow_buffer::{bit_util, MutableBuffer};
+use arrow_data::bit_mask::combine_option_bitmap;
+use arrow_data::ArrayData;
+use arrow_schema::*;
+use regex::Regex;
+use std::collections::HashMap;
+
+/// Evaluates SQL `left LIKE right` row by row on a pair of [`StringArray`]s or
+/// a pair of [`LargeStringArray`]s; row `i` of `right` is the pattern applied
+/// to row `i` of `left`.
+///
+/// The LIKE operator understands two wildcards:
+///
+/// 1. `%` - The percent sign represents zero, one, or multiple characters
+/// 2. `_` - The underscore represents a single character
+///
+/// Regex metacharacters such as `.` carry no special meaning in a pattern.
+///
+/// For example:
+/// ```
+/// use arrow_array::{StringArray, BooleanArray};
+/// use arrow_string::like::like_utf8;
+///
+/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
+/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]);
+///
+/// let result = like_utf8(&strings, &patterns).unwrap();
+/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
+/// ```
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if the arrays differ in length or a
+/// translated pattern is rejected by the regex engine.
+pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    // Anchor at both ends: LIKE has to cover the whole value, not a substring.
+    let compile = |re_pattern: &str| -> Result<Regex, ArrowError> {
+        let anchored = format!("^{}$", re_pattern);
+        Regex::new(&anchored).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    };
+
+    regex_like(left, right, false, compile)
+}
+
+/// Evaluates SQL `left LIKE right` row by row, choosing the kernel from the
+/// runtime data types of both arguments.
+///
+/// Supported pairs are `Utf8`/`Utf8`, `LargeUtf8`/`LargeUtf8` and, only when
+/// the `dyn_cmp_dict` feature is enabled, [`DictionaryArray`]/[`DictionaryArray`].
+/// Any other pair is answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray, ArrowError> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            like_utf8(as_string_array(left), as_string_array(right))
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            like_utf8(as_largestring_array(left), as_largestring_array(right))
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    like_dict(left, as_dictionary_array(right))
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left LIKE right` row by row on two [`DictionaryArray`]s
+/// whose values are both [`StringArray`] or both [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn like_dict<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray, ArrowError> {
+    // Both value-type arms compile patterns identically: anchored, case-sensitive.
+    fn compile(re_pattern: &str) -> Result<Regex, ArrowError> {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    }
+
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => regex_like(
+            left.downcast_dict::<StringArray>().unwrap(),
+            right.downcast_dict::<StringArray>().unwrap(),
+            false,
+            compile,
+        ),
+        (DataType::LargeUtf8, DataType::LargeUtf8) => regex_like(
+            left.downcast_dict::<LargeStringArray>().unwrap(),
+            right.downcast_dict::<LargeStringArray>().unwrap(),
+            false,
+            compile,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
+/// Shared body of the scalar `LIKE` / `NOT LIKE` kernels: matches every value
+/// of `left` against the single pattern `right` and passes each raw match
+/// result through `op` (identity for `LIKE`, negation for `NOT LIKE`).
+///
+/// Patterns shaped like `literal`, `literal%`, `%literal` and `%literal%` are
+/// answered with plain string operations. Any other pattern is translated by
+/// `replace_like_wildcards` and evaluated as an anchored regex.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if the translated pattern is not a
+/// valid regex.
+#[inline]
+fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+    op: F,
+) -> Result<BooleanArray, ArrowError> {
+    // No wildcard at all: plain equality.
+    if !right.contains(is_like_pattern) {
+        return Ok(BooleanArray::from_unary(left, |value| op(value == right)));
+    }
+
+    // A trailing `\%` is an escaped literal percent sign, not a wildcard, so
+    // the fast paths that rely on a trailing wildcard must not fire.
+    let escaped_tail = right.ends_with("\\%");
+
+    // `literal%`: prefix test.
+    if let Some(prefix) = right.strip_suffix('%') {
+        if !escaped_tail && !prefix.contains(is_like_pattern) {
+            return Ok(BooleanArray::from_unary(left, |value| {
+                op(value.starts_with(prefix))
+            }));
+        }
+    }
+
+    if let Some(suffix) = right.strip_prefix('%') {
+        // `%literal`: suffix test.
+        if !suffix.contains(is_like_pattern) {
+            return Ok(BooleanArray::from_unary(left, |value| {
+                op(value.ends_with(suffix))
+            }));
+        }
+
+        // `%literal%`: substring test.
+        if !escaped_tail {
+            if let Some(needle) = suffix.strip_suffix('%') {
+                if !needle.contains(is_like_pattern) {
+                    return Ok(BooleanArray::from_unary(left, |value| {
+                        op(value.contains(needle))
+                    }));
+                }
+            }
+        }
+    }
+
+    // General case: translate to an anchored regex.
+    let re_pattern = replace_like_wildcards(right)?;
+    let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+        ArrowError::ComputeError(format!(
+            "Unable to build regex from LIKE pattern: {}",
+            e
+        ))
+    })?;
+
+    Ok(BooleanArray::from_unary(left, |value| op(re.is_match(value))))
+}
+
+/// Scalar `LIKE`: matches every value of `left` against the pattern `right`
+/// and reports the match result unchanged.
+#[inline]
+fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    like_scalar_op(left, right, std::convert::identity)
+}
+
+/// Evaluates SQL `left LIKE right` for every row of `left` against the single
+/// pattern `right`, choosing the kernel from the runtime type of `left`.
+///
+/// `left` may be a [`StringArray`], a [`LargeStringArray`], or a
+/// [`DictionaryArray`] whose values are one of those two. Any other type is
+/// answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn like_utf8_scalar_dyn(
+    left: &dyn Array,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.data_type() {
+        DataType::Utf8 => like_scalar(as_string_array(left), right),
+        DataType::LargeUtf8 => like_scalar(as_largestring_array(left), right),
+        DataType::Dictionary(_, _) => {
+            downcast_dictionary_array!(
+                left => {
+                    like_dict_scalar(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left LIKE right` for every row of a [`StringArray`] or
+/// [`LargeStringArray`] against the single pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    // Plain LIKE: the match result is reported as-is.
+    like_scalar_op(left, right, |matched| matched)
+}
+
+/// Evaluates SQL `left LIKE right` for every row of a [`DictionaryArray`]
+/// with [`StringArray`] or [`LargeStringArray`] values against the single
+/// pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+fn like_dict_scalar<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.value_type() {
+        DataType::Utf8 => {
+            // The value type was checked just above, so the downcast cannot fail.
+            let values = left.downcast_dict::<StringArray>().unwrap();
+            like_scalar(values, right)
+        }
+        DataType::LargeUtf8 => {
+            let values = left.downcast_dict::<LargeStringArray>().unwrap();
+            like_scalar(values, right)
+        }
+        _ => Err(ArrowError::ComputeError(
+            "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Translates a SQL LIKE `pattern` into the body of an equivalent regex.
+///
+/// The characters of `pattern` are rewritten one by one:
+///
+/// 1. A backslash followed by `%` or `_` yields that wildcard character as a literal
+/// 2. Any other backslash yields an escaped backslash (`\\`)
+/// 3. Regex meta characters are backslash-escaped so they match literally, e.g. `.` => `\.`
+/// 4. An unescaped `%` becomes `.*` and an unescaped `_` becomes `.`
+/// 5. Every other character is copied unchanged
+///
+/// No anchors or flags are added here; callers wrap the result themselves.
+fn replace_like_wildcards(pattern: &str) -> Result<String, ArrowError> {
+    let mut result = String::new();
+    let mut chars_iter = pattern.chars().peekable();
+    while let Some(c) = chars_iter.next() {
+        match c {
+            '\\' => match chars_iter.peek().copied() {
+                Some(next) if is_like_pattern(next) => {
+                    // Escaped wildcard: emit it literally and consume it.
+                    result.push(next);
+                    chars_iter.next();
+                }
+                _ => result.push_str("\\\\"),
+            },
+            _ if regex_syntax::is_meta_character(c) => {
+                result.push('\\');
+                result.push(c);
+            }
+            '%' => result.push_str(".*"),
+            '_' => result.push('.'),
+            _ => result.push(c),
+        }
+    }
+    Ok(result)
+}
+
+/// Evaluates SQL `left NOT LIKE right` row by row on a pair of
+/// [`StringArray`]s or a pair of [`LargeStringArray`]s.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    // Same regex as LIKE; the negation is applied by `regex_like` itself.
+    let compile = |re_pattern: &str| -> Result<Regex, ArrowError> {
+        let anchored = format!("^{}$", re_pattern);
+        Regex::new(&anchored).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    };
+
+    regex_like(left, right, true, compile)
+}
+
+/// Evaluates SQL `left NOT LIKE right` row by row, choosing the kernel from
+/// the runtime data types of both arguments.
+///
+/// Supported pairs are `Utf8`/`Utf8`, `LargeUtf8`/`LargeUtf8` and, only when
+/// the `dyn_cmp_dict` feature is enabled, [`DictionaryArray`]/[`DictionaryArray`].
+/// Any other pair is answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_dyn(
+    left: &dyn Array,
+    right: &dyn Array,
+) -> Result<BooleanArray, ArrowError> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            nlike_utf8(as_string_array(left), as_string_array(right))
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            nlike_utf8(as_largestring_array(left), as_largestring_array(right))
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    nlike_dict(left, as_dictionary_array(right))
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left NOT LIKE right` row by row on two [`DictionaryArray`]s
+/// whose values are both [`StringArray`] or both [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn nlike_dict<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray, ArrowError> {
+    // Both value-type arms compile patterns identically: anchored, case-sensitive.
+    fn compile(re_pattern: &str) -> Result<Regex, ArrowError> {
+        Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from LIKE pattern: {}",
+                e
+            ))
+        })
+    }
+
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => regex_like(
+            left.downcast_dict::<StringArray>().unwrap(),
+            right.downcast_dict::<StringArray>().unwrap(),
+            true,
+            compile,
+        ),
+        (DataType::LargeUtf8, DataType::LargeUtf8) => regex_like(
+            left.downcast_dict::<LargeStringArray>().unwrap(),
+            right.downcast_dict::<LargeStringArray>().unwrap(),
+            true,
+            compile,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
+/// Scalar `NOT LIKE`: matches every value of `left` against the pattern
+/// `right` and reports the inverted match result.
+#[inline]
+fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    like_scalar_op(left, right, <bool as std::ops::Not>::not)
+}
+
+/// Evaluates SQL `left NOT LIKE right` for every row of `left` against the
+/// single pattern `right`, choosing the kernel from the runtime type of `left`.
+///
+/// `left` may be a [`StringArray`], a [`LargeStringArray`], or a
+/// [`DictionaryArray`] whose values are one of those two. Any other type is
+/// answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_utf8_scalar_dyn(
+    left: &dyn Array,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.data_type() {
+        DataType::Utf8 => nlike_scalar(as_string_array(left), right),
+        DataType::LargeUtf8 => nlike_scalar(as_largestring_array(left), right),
+        DataType::Dictionary(_, _) => {
+            downcast_dictionary_array!(
+                left => {
+                    nlike_dict_scalar(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left NOT LIKE right` for every row of a [`StringArray`] or
+/// [`LargeStringArray`] against the single pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    // NOT LIKE: every match result is inverted.
+    like_scalar_op(left, right, |matched| !matched)
+}
+
+/// Evaluates SQL `left NOT LIKE right` for every row of a [`DictionaryArray`]
+/// with [`StringArray`] or [`LargeStringArray`] values against the single
+/// pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+fn nlike_dict_scalar<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.value_type() {
+        DataType::Utf8 => {
+            // The value type was checked just above, so the downcast cannot fail.
+            let values = left.downcast_dict::<StringArray>().unwrap();
+            nlike_scalar(values, right)
+        }
+        DataType::LargeUtf8 => {
+            let values = left.downcast_dict::<LargeStringArray>().unwrap();
+            nlike_scalar(values, right)
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left ILIKE right` (case-insensitive `LIKE`) row by row on a
+/// pair of [`StringArray`]s or a pair of [`LargeStringArray`]s.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    // `(?i)` turns on case-insensitive matching for the anchored pattern.
+    let compile = |re_pattern: &str| -> Result<Regex, ArrowError> {
+        let anchored = format!("(?i)^{}$", re_pattern);
+        Regex::new(&anchored).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    };
+
+    regex_like(left, right, false, compile)
+}
+
+/// Evaluates SQL `left ILIKE right` row by row, choosing the kernel from the
+/// runtime data types of both arguments.
+///
+/// Supported pairs are `Utf8`/`Utf8`, `LargeUtf8`/`LargeUtf8` and, only when
+/// the `dyn_cmp_dict` feature is enabled, [`DictionaryArray`]/[`DictionaryArray`].
+/// Any other pair is answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_dyn(
+    left: &dyn Array,
+    right: &dyn Array,
+) -> Result<BooleanArray, ArrowError> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            ilike_utf8(as_string_array(left), as_string_array(right))
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            ilike_utf8(as_largestring_array(left), as_largestring_array(right))
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    ilike_dict(left, as_dictionary_array(right))
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left ILIKE right` row by row on two [`DictionaryArray`]s
+/// whose values are both [`StringArray`] or both [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn ilike_dict<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray, ArrowError> {
+    // Both value-type arms compile patterns identically: anchored, case-insensitive.
+    fn compile(re_pattern: &str) -> Result<Regex, ArrowError> {
+        Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    }
+
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => regex_like(
+            left.downcast_dict::<StringArray>().unwrap(),
+            right.downcast_dict::<StringArray>().unwrap(),
+            false,
+            compile,
+        ),
+        (DataType::LargeUtf8, DataType::LargeUtf8) => regex_like(
+            left.downcast_dict::<LargeStringArray>().unwrap(),
+            right.downcast_dict::<LargeStringArray>().unwrap(),
+            false,
+            compile,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
+/// Scalar `ILIKE`: matches every value of `left` against the single pattern
+/// `right`, ignoring case.
+///
+/// Patterns shaped like `literal`, `literal%`, `%literal` and `%literal%` are
+/// answered by comparing upper-cased strings. Any other pattern is translated
+/// by `replace_like_wildcards` and evaluated as an anchored `(?i)` regex.
+///
+/// A pattern ending in an escaped `\%` never takes the `literal%` or
+/// `%literal%` fast paths, because its trailing `%` is a literal character.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if the translated pattern is not a
+/// valid regex.
+#[inline]
+fn ilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    // Sets bit `i` of `bits` for every row `i` of `values` accepted by `pred`.
+    fn mark_rows<'s, A, P>(values: &A, bits: &mut [u8], pred: P)
+    where
+        A: ArrayAccessor<Item = &'s str>,
+        P: Fn(&str) -> bool,
+    {
+        for i in 0..values.len() {
+            if pred(values.value(i)) {
+                bit_util::set_bit(bits, i);
+            }
+        }
+    }
+
+    let len = left.len();
+    // The result is assembled with offset 0, so the validity bits must be
+    // re-based to start at this array's own offset; otherwise a sliced input
+    // would report the nulls of the rows in front of the slice.
+    let null_bit_buffer = left
+        .data()
+        .null_buffer()
+        .map(|nulls| nulls.bit_slice(left.data().offset(), len));
+    let mut bool_buf = MutableBuffer::from_len_zeroed(bit_util::ceil(len, 8));
+    let bool_slice = bool_buf.as_slice_mut();
+    let escaped_tail = right.ends_with("\\%");
+
+    if !right.contains(is_like_pattern) {
+        // fast path, can use equals
+        let needle = right.to_uppercase();
+        mark_rows(&left, bool_slice, |s| s.to_uppercase() == needle);
+    } else if right.ends_with('%')
+        && !escaped_tail
+        && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use starts_with
+        let prefix = right[..right.len() - 1].to_uppercase();
+        mark_rows(&left, bool_slice, |s| {
+            s.to_uppercase().starts_with(prefix.as_str())
+        });
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, can use ends_with
+        let suffix = right[1..].to_uppercase();
+        mark_rows(&left, bool_slice, |s| {
+            s.to_uppercase().ends_with(suffix.as_str())
+        });
+    } else if right.starts_with('%')
+        && right.ends_with('%')
+        && !escaped_tail
+        && !right[1..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use contains
+        let infix = right[1..right.len() - 1].to_uppercase();
+        mark_rows(&left, bool_slice, |s| {
+            s.to_uppercase().contains(infix.as_str())
+        });
+    } else {
+        let re_pattern = replace_like_wildcards(right)?;
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })?;
+
+        mark_rows(&left, bool_slice, |s| re.is_match(s));
+    }
+
+    // SAFETY: the single value buffer holds `ceil(len / 8)` bytes, and the
+    // validity bitmap (when present) was sliced above to describe exactly
+    // `len` rows starting at bit 0, matching the offset of 0 passed here.
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::Boolean,
+            len,
+            None,
+            null_bit_buffer,
+            0,
+            vec![bool_buf.into()],
+            vec![],
+        )
+    };
+    Ok(BooleanArray::from(data))
+}
+
+/// Evaluates SQL `left ILIKE right` for every row of `left` against the
+/// single pattern `right`, choosing the kernel from the runtime type of `left`.
+///
+/// `left` may be a [`StringArray`], a [`LargeStringArray`], or a
+/// [`DictionaryArray`] whose values are one of those two. Any other type is
+/// answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_utf8_scalar_dyn(
+    left: &dyn Array,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.data_type() {
+        DataType::Utf8 => ilike_scalar(as_string_array(left), right),
+        DataType::LargeUtf8 => ilike_scalar(as_largestring_array(left), right),
+        DataType::Dictionary(_, _) => {
+            downcast_dictionary_array!(
+                left => {
+                    ilike_dict_scalar(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left ILIKE right` (case-insensitive `LIKE`) for every row of
+/// a [`StringArray`] or [`LargeStringArray`] against the single pattern `right`.
+///
+/// This is the statically typed entry point; it delegates to `ilike_scalar`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ ilike_scalar(left, right)
+}
+
+/// Evaluates SQL `left ILIKE right` for every row of a [`DictionaryArray`]
+/// with [`StringArray`] or [`LargeStringArray`] values against the single
+/// pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+fn ilike_dict_scalar<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.value_type() {
+        DataType::Utf8 => {
+            // The value type was checked just above, so the downcast cannot fail.
+            let values = left.downcast_dict::<StringArray>().unwrap();
+            ilike_scalar(values, right)
+        }
+        DataType::LargeUtf8 => {
+            let values = left.downcast_dict::<LargeStringArray>().unwrap();
+            ilike_scalar(values, right)
+        }
+        _ => Err(ArrowError::ComputeError(
+            "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left NOT ILIKE right` row by row on a pair of
+/// [`StringArray`]s or a pair of [`LargeStringArray`]s.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
+    left: &GenericStringArray<OffsetSize>,
+    right: &GenericStringArray<OffsetSize>,
+) -> Result<BooleanArray, ArrowError> {
+    // Same regex as ILIKE; the negation is applied by `regex_like` itself.
+    let compile = |re_pattern: &str| -> Result<Regex, ArrowError> {
+        let anchored = format!("(?i)^{}$", re_pattern);
+        Regex::new(&anchored).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    };
+
+    regex_like(left, right, true, compile)
+}
+
+/// Evaluates SQL `left NOT ILIKE right` row by row, choosing the kernel from
+/// the runtime data types of both arguments.
+///
+/// Supported pairs are `Utf8`/`Utf8`, `LargeUtf8`/`LargeUtf8` and, only when
+/// the `dyn_cmp_dict` feature is enabled, [`DictionaryArray`]/[`DictionaryArray`].
+/// Any other pair is answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_dyn(
+    left: &dyn Array,
+    right: &dyn Array,
+) -> Result<BooleanArray, ArrowError> {
+    match (left.data_type(), right.data_type()) {
+        (DataType::Utf8, DataType::Utf8) => {
+            nilike_utf8(as_string_array(left), as_string_array(right))
+        }
+        (DataType::LargeUtf8, DataType::LargeUtf8) => {
+            nilike_utf8(as_largestring_array(left), as_largestring_array(right))
+        }
+        #[cfg(feature = "dyn_cmp_dict")]
+        (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
+            downcast_dictionary_array!(
+                left => {
+                    nilike_dict(left, as_dictionary_array(right))
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left NOT ILIKE right` row by row on two [`DictionaryArray`]s
+/// whose values are both [`StringArray`] or both [`LargeStringArray`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+#[cfg(feature = "dyn_cmp_dict")]
+fn nilike_dict<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+) -> Result<BooleanArray, ArrowError> {
+    // Both value-type arms compile patterns identically: anchored, case-insensitive.
+    fn compile(re_pattern: &str) -> Result<Regex, ArrowError> {
+        Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })
+    }
+
+    match (left.value_type(), right.value_type()) {
+        (DataType::Utf8, DataType::Utf8) => regex_like(
+            left.downcast_dict::<StringArray>().unwrap(),
+            right.downcast_dict::<StringArray>().unwrap(),
+            true,
+            compile,
+        ),
+        (DataType::LargeUtf8, DataType::LargeUtf8) => regex_like(
+            left.downcast_dict::<LargeStringArray>().unwrap(),
+            right.downcast_dict::<LargeStringArray>().unwrap(),
+            true,
+            compile,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
+                .to_string(),
+        )),
+    }
+}
+
+/// Scalar `NOT ILIKE`: marks every value of `left` that does NOT match the
+/// single pattern `right`, ignoring case.
+///
+/// Patterns shaped like `literal`, `literal%`, `%literal` and `%literal%` are
+/// answered by comparing upper-cased strings. Any other pattern is translated
+/// by `replace_like_wildcards` and evaluated as an anchored `(?i)` regex.
+///
+/// A pattern ending in an escaped `\%` never takes the `literal%` or
+/// `%literal%` fast paths, because its trailing `%` is a literal character.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if the translated pattern is not a
+/// valid regex.
+#[inline]
+fn nilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
+    left: L,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    // Sets bit `i` of `bits` for every row `i` of `values` rejected by `pred`.
+    fn mark_rejected_rows<'s, A, P>(values: &A, bits: &mut [u8], pred: P)
+    where
+        A: ArrayAccessor<Item = &'s str>,
+        P: Fn(&str) -> bool,
+    {
+        for i in 0..values.len() {
+            if !pred(values.value(i)) {
+                bit_util::set_bit(bits, i);
+            }
+        }
+    }
+
+    let len = left.len();
+    // The result is assembled with offset 0, so the validity bits must be
+    // re-based to start at this array's own offset; otherwise a sliced input
+    // would report the nulls of the rows in front of the slice.
+    let null_bit_buffer = left
+        .data()
+        .null_buffer()
+        .map(|nulls| nulls.bit_slice(left.data().offset(), len));
+    let mut bool_buf = MutableBuffer::from_len_zeroed(bit_util::ceil(len, 8));
+    let bool_slice = bool_buf.as_slice_mut();
+    let escaped_tail = right.ends_with("\\%");
+
+    if !right.contains(is_like_pattern) {
+        // fast path, can use equals
+        let needle = right.to_uppercase();
+        mark_rejected_rows(&left, bool_slice, |s| s.to_uppercase() == needle);
+    } else if right.ends_with('%')
+        && !escaped_tail
+        && !right[..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use starts_with
+        let prefix = right[..right.len() - 1].to_uppercase();
+        mark_rejected_rows(&left, bool_slice, |s| {
+            s.to_uppercase().starts_with(prefix.as_str())
+        });
+    } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
+        // fast path, can use ends_with
+        let suffix = right[1..].to_uppercase();
+        mark_rejected_rows(&left, bool_slice, |s| {
+            s.to_uppercase().ends_with(suffix.as_str())
+        });
+    } else if right.starts_with('%')
+        && right.ends_with('%')
+        && !escaped_tail
+        && !right[1..right.len() - 1].contains(is_like_pattern)
+    {
+        // fast path, can use contains
+        let infix = right[1..right.len() - 1].to_uppercase();
+        mark_rejected_rows(&left, bool_slice, |s| {
+            s.to_uppercase().contains(infix.as_str())
+        });
+    } else {
+        let re_pattern = replace_like_wildcards(right)?;
+        let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
+            ArrowError::ComputeError(format!(
+                "Unable to build regex from ILIKE pattern: {}",
+                e
+            ))
+        })?;
+
+        mark_rejected_rows(&left, bool_slice, |s| re.is_match(s));
+    }
+
+    // SAFETY: the single value buffer holds `ceil(len / 8)` bytes, and the
+    // validity bitmap (when present) was sliced above to describe exactly
+    // `len` rows starting at bit 0, matching the offset of 0 passed here.
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::Boolean,
+            len,
+            None,
+            null_bit_buffer,
+            0,
+            vec![bool_buf.into()],
+            vec![],
+        )
+    };
+    Ok(BooleanArray::from(data))
+}
+
+/// Evaluates SQL `left NOT ILIKE right` for every row of `left` against the
+/// single pattern `right`, choosing the kernel from the runtime type of `left`.
+///
+/// `left` may be a [`StringArray`], a [`LargeStringArray`], or a
+/// [`DictionaryArray`] whose values are one of those two. Any other type is
+/// answered with an [`ArrowError::ComputeError`].
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_utf8_scalar_dyn(
+    left: &dyn Array,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.data_type() {
+        DataType::Utf8 => nilike_scalar(as_string_array(left), right),
+        DataType::LargeUtf8 => nilike_scalar(as_largestring_array(left), right),
+        DataType::Dictionary(_, _) => {
+            downcast_dictionary_array!(
+                left => {
+                    nilike_dict_scalar(left, right)
+                }
+                t => Err(ArrowError::ComputeError(format!(
+                    "Should be DictionaryArray but got: {}", t
+                )))
+            )
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Evaluates SQL `left NOT ILIKE right` (negated case-insensitive `LIKE`) for
+/// every row of a [`StringArray`] or [`LargeStringArray`] against the single
+/// pattern `right`.
+///
+/// This is the statically typed entry point; it delegates to `nilike_scalar`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ left: &GenericStringArray<OffsetSize>,
+ right: &str,
+) -> Result<BooleanArray, ArrowError> {
+ nilike_scalar(left, right)
+}
+
+/// Evaluates SQL `left NOT ILIKE right` for every row of a [`DictionaryArray`]
+/// with [`StringArray`] or [`LargeStringArray`] values against the single
+/// pattern `right`.
+///
+/// See the documentation on [`like_utf8`] for more details.
+fn nilike_dict_scalar<K: ArrowPrimitiveType>(
+    left: &DictionaryArray<K>,
+    right: &str,
+) -> Result<BooleanArray, ArrowError> {
+    match left.value_type() {
+        DataType::Utf8 => {
+            // The value type was checked just above, so the downcast cannot fail.
+            let values = left.downcast_dict::<StringArray>().unwrap();
+            nilike_scalar(values, right)
+        }
+        DataType::LargeUtf8 => {
+            let values = left.downcast_dict::<LargeStringArray>().unwrap();
+            nilike_scalar(values, right)
+        }
+        _ => Err(ArrowError::ComputeError(
+            "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
+        )),
+    }
+}
+
+/// Returns `true` when `c` is one of the two SQL LIKE wildcards, `%` or `_`.
+fn is_like_pattern(c: char) -> bool {
+    matches!(c, '%' | '_')
+}
+
+/// Matches row `i` of `left` against the LIKE pattern in row `i` of `right`.
+///
+/// Each distinct pattern is translated with `replace_like_wildcards`, turned
+/// into a [`Regex`] by `op`, and cached so it is compiled only once. The
+/// validity of the result is what `combine_option_bitmap` computes from both
+/// inputs.
+///
+/// If `negate_regex` is true, every match result is inverted (for example,
+/// for `NOT LIKE`).
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if the arrays differ in length, and
+/// forwards any error produced by `op`.
+fn regex_like<'a, S: ArrayAccessor<Item = &'a str>, F>(
+    left: S,
+    right: S,
+    negate_regex: bool,
+    op: F,
+) -> Result<BooleanArray, ArrowError>
+where
+    F: Fn(&str) -> Result<Regex, ArrowError>,
+{
+    let len = left.len();
+    if len != right.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform comparison operation on arrays of different length"
+                .to_string(),
+        ));
+    }
+
+    let null_bit_buffer =
+        combine_option_bitmap(&[left.data_ref(), right.data_ref()], len);
+
+    // Compiled regexes, keyed by the raw LIKE pattern they were built from.
+    let mut compiled: HashMap<&str, Regex> = HashMap::new();
+    let mut result = BooleanBufferBuilder::new(len);
+    for i in 0..len {
+        let haystack = left.value(i);
+        let pat = right.value(i);
+        let re = match compiled.entry(pat) {
+            std::collections::hash_map::Entry::Occupied(slot) => slot.into_mut(),
+            std::collections::hash_map::Entry::Vacant(slot) => {
+                let re_pattern = replace_like_wildcards(pat)?;
+                slot.insert(op(&re_pattern)?)
+            }
+        };
+
+        // `!=` on booleans is xor: it flips the match exactly when negating.
+        result.append(re.is_match(haystack) != negate_regex);
+    }
+
+    let data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::Boolean,
+            len,
+            None,
+            null_bit_buffer,
+            0,
+            vec![result.finish()],
+            vec![],
+        )
+    };
+    Ok(BooleanArray::from(data))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow_array::types::Int8Type;
+
+    /// Generates a `#[test]` named `$test_name` that builds two `StringArray`s
+    /// from `$left` and `$right`, applies `$op` to them and compares every
+    /// result row with `$expected`.
+    macro_rules! test_utf8 {
+        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
+            #[test]
+            fn $test_name() {
+                let left = StringArray::from($left);
+                let right = StringArray::from($right);
+                let res = $op(&left, &right).unwrap();
+                let expected = $expected;
+                assert_eq!(expected.len(), res.len());
+                for (i, want) in expected.iter().enumerate() {
+                    assert_eq!(res.value(i), *want);
+                }
+            }
+        };
+    }
+
+    /// Generates a `#[test]` named `$test_name`, compiled only with the
+    /// `dyn_cmp_dict` feature, that collects `$left` and `$right` into
+    /// `DictionaryArray<Int8Type>`s, applies `$op` to them and compares every
+    /// result row with `$expected`.
+    macro_rules! test_dict_utf8 {
+        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
+            #[test]
+            #[cfg(feature = "dyn_cmp_dict")]
+            fn $test_name() {
+                let left: DictionaryArray<Int8Type> = $left.into_iter().collect();
+                let right: DictionaryArray<Int8Type> = $right.into_iter().collect();
+                let res = $op(&left, &right).unwrap();
+                let expected = $expected;
+                assert_eq!(expected.len(), res.len());
+                for (i, want) in expected.iter().enumerate() {
+                    assert_eq!(res.value(i), *want);
+                }
+            }
+        };
+    }
+
+ macro_rules! test_utf8_scalar {
+ ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
+ #[test]
+ fn $test_name() {
+ let left = StringArray::from($left);
+ let res = $op(&left, $right).unwrap();
+ let expected = $expected;
+ assert_eq!(expected.len(), res.len());
+ for i in 0..res.len() {
+ let v = res.value(i);
+ assert_eq!(
+ v,
+ expected[i],
+ "unexpected result when comparing {} at position {} to {} ",
+ left.value(i),
+ i,
+ $right
+ );
+ }
+
+ let left = LargeStringArray::from($left);
+ let res = $op(&left, $right).unwrap();
+ let expected = $expected;
+ assert_eq!(expected.len(), res.len());
+ for i in 0..res.len() {
+ let v = res.value(i);
+ assert_eq!(
+ v,
+ expected[i],
+ "unexpected result when comparing {} at position {} to {} ",
+ left.value(i),
+ i,
+ $right
+ );
+ }
+ }
+ };
+ ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => {
+ test_utf8_scalar!($test_name, $left, $right, $op, $expected);
+ test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected);
+ };
+ }
+
+ test_utf8!(
+ test_utf8_array_like,
+ vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
+ vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
+ like_utf8,
+ vec![true, true, true, false, false, true, false, false]
+ );
+
+ test_dict_utf8!(
+ test_utf8_array_like_dict,
+ vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
+ vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
+ like_dyn,
+ vec![true, true, true, false, false, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_escape_testing,
+ test_utf8_array_like_scalar_dyn_escape_testing,
+ vec!["varchar(255)", "int(255)", "varchar", "int"],
+ "%(%)%",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_escape_regex,
+ test_utf8_array_like_scalar_dyn_escape_regex,
+ vec![".*", "a", "*"],
+ ".*",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_escape_regex_dot,
+ test_utf8_array_like_scalar_dyn_escape_regex_dot,
+ vec![".", "a", "*"],
+ ".",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar,
+ test_utf8_array_like_scalar_dyn,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "%ar%",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_start,
+ test_utf8_array_like_scalar_dyn_start,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow%",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false, true, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_end,
+ test_utf8_array_like_scalar_dyn_end,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "%arrow",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_equals,
+ test_utf8_array_like_scalar_dyn_equals,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_like_scalar_one,
+ test_utf8_array_like_scalar_dyn_one,
+ vec!["arrow", "arrows", "parrow", "arr"],
+ "arrow_",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![false, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_scalar_like_escape,
+ test_utf8_scalar_like_dyn_escape,
+ vec!["a%", "a\\x"],
+ "a\\%",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_scalar_like_escape_contains,
+ test_utf8_scalar_like_dyn_escape_contains,
+ vec!["ba%", "ba\\x"],
+ "%a\\%",
+ like_utf8_scalar,
+ like_utf8_scalar_dyn,
+ vec![true, false]
+ );
+
+ test_utf8!(
+ test_utf8_scalar_ilike_regex,
+ vec!["%%%"],
+ vec![r#"\%_\%"#],
+ ilike_utf8,
+ vec![true]
+ );
+
+ test_dict_utf8!(
+ test_utf8_scalar_ilike_regex_dict,
+ vec!["%%%"],
+ vec![r#"\%_\%"#],
+ ilike_dyn,
+ vec![true]
+ );
+
+ #[test]
+ fn test_replace_like_wildcards() {
+ let a_eq = "_%";
+ let expected = "..*";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_leave_like_meta_chars() {
+ let a_eq = "\\%\\_";
+ let expected = "%_";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_with_multiple_escape_chars() {
+ let a_eq = "\\\\%";
+ let expected = "\\\\%";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ #[test]
+ fn test_replace_like_wildcards_escape_regex_meta_char() {
+ let a_eq = ".";
+ let expected = "\\.";
+ assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
+ }
+
+ test_utf8!(
+ test_utf8_array_nlike,
+ vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+ vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
+ nlike_utf8,
+ vec![false, false, false, true, true, false, true]
+ );
+
+ test_dict_utf8!(
+ test_utf8_array_nlike_dict,
+ vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
+ vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
+ nlike_dyn,
+ vec![false, false, false, true, true, false, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_escape_testing,
+ test_utf8_array_nlike_escape_dyn_testing_dyn,
+ vec!["varchar(255)", "int(255)", "varchar", "int"],
+ "%(%)%",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_escape_regex,
+ test_utf8_array_nlike_scalar_dyn_escape_regex,
+ vec![".*", "a", "*"],
+ ".*",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_escape_regex_dot,
+ test_utf8_array_nlike_scalar_dyn_escape_regex_dot,
+ vec![".", "a", "*"],
+ ".",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, true, true]
+ );
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar,
+ test_utf8_array_nlike_scalar_dyn,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "%ar%",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_start,
+ test_utf8_array_nlike_scalar_dyn_start,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow%",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, true, false, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_end,
+ test_utf8_array_nlike_scalar_dyn_end,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "%arrow",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_equals,
+ test_utf8_array_nlike_scalar_dyn_equals,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "arrow",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![false, true, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nlike_scalar_one,
+ test_utf8_array_nlike_scalar_dyn_one,
+ vec!["arrow", "arrows", "parrow", "arr"],
+ "arrow_",
+ nlike_utf8_scalar,
+ nlike_utf8_scalar_dyn,
+ vec![true, false, true, true]
+ );
+
+ test_utf8!(
+ test_utf8_array_ilike,
+ vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+ ilike_utf8,
+ vec![true, true, true, false, false, true, false]
+ );
+
+ test_dict_utf8!(
+ test_utf8_array_ilike_dict,
+ vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+ ilike_dyn,
+ vec![true, true, true, false, false, true, false]
+ );
+
+ test_utf8_scalar!(
+ ilike_utf8_scalar_escape_testing,
+ ilike_utf8_scalar_escape_dyn_testing,
+ vec!["varchar(255)", "int(255)", "varchar", "int"],
+ "%(%)%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_scalar,
+ test_utf8_array_ilike_dyn_scalar,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "%AR%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_scalar_start,
+ test_utf8_array_ilike_scalar_dyn_start,
+ vec!["arrow", "parrow", "arrows", "ARR"],
+ "aRRow%",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![true, false, true, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_scalar_end,
+ test_utf8_array_ilike_scalar_dyn_end,
+ vec!["ArroW", "parrow", "ARRowS", "arr"],
+ "%arrow",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![true, true, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_scalar_equals,
+ test_utf8_array_ilike_scalar_dyn_equals,
+ vec!["arrow", "parrow", "arrows", "arr"],
+ "Arrow",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![true, false, false, false]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_ilike_scalar_one,
+ test_utf8_array_ilike_scalar_dyn_one,
+ vec!["arrow", "arrows", "parrow", "arr"],
+ "arrow_",
+ ilike_utf8_scalar,
+ ilike_utf8_scalar_dyn,
+ vec![false, true, false, false]
+ );
+
+ test_utf8!(
+ test_utf8_array_nilike,
+ vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+ nilike_utf8,
+ vec![false, false, false, true, true, false, true]
+ );
+
+ test_dict_utf8!(
+ test_utf8_array_nilike_dict,
+ vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
+ vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
+ nilike_dyn,
+ vec![false, false, false, true, true, false, true]
+ );
+
+ test_utf8_scalar!(
+ nilike_utf8_scalar_escape_testing,
+ nilike_utf8_scalar_escape_dyn_testing,
+ vec!["varchar(255)", "int(255)", "varchar", "int"],
+ "%(%)%",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar,
+ test_utf8_array_nilike_dyn_scalar,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "%AR%",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_start,
+ test_utf8_array_nilike_scalar_dyn_start,
+ vec!["arrow", "parrow", "arrows", "ARR"],
+ "aRRow%",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![false, true, false, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_end,
+ test_utf8_array_nilike_scalar_dyn_end,
+ vec!["ArroW", "parrow", "ARRowS", "arr"],
+ "%arrow",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![false, false, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_equals,
+ test_utf8_array_nilike_scalar_dyn_equals,
+ vec!["arRow", "parrow", "arrows", "arr"],
+ "Arrow",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![false, true, true, true]
+ );
+
+ test_utf8_scalar!(
+ test_utf8_array_nilike_scalar_one,
+ test_utf8_array_nilike_scalar_dyn_one,
+ vec!["arrow", "arrows", "parrow", "arr"],
+ "arrow_",
+ nilike_utf8_scalar,
+ nilike_utf8_scalar_dyn,
+ vec![true, false, true, true]
+ );
+
+ #[test]
+ fn test_dict_like_kernels() {
+ let data = vec![
+ Some("Earth"),
+ Some("Fire"),
+ Some("Water"),
+ Some("Air"),
+ None,
+ Some("Air"),
+ ];
+
+ let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+ }
+
+ #[test]
+ fn test_dict_nlike_kernels() {
+ let data = vec![
+ Some("Earth"),
+ Some("Fire"),
+ Some("Water"),
+ Some("Air"),
+ None,
+ Some("Air"),
+ ];
+
+ let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+ }
+
+ #[test]
+ fn test_dict_ilike_kernels() {
+ let data = vec![
+ Some("Earth"),
+ Some("Fire"),
+ Some("Water"),
+ Some("Air"),
+ None,
+ Some("Air"),
+ ];
+
+ let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+ }
+
+ #[test]
+ fn test_dict_nilike_kernels() {
+ let data = vec![
+ Some("Earth"),
+ Some("Fire"),
+ Some("Water"),
+ Some("Air"),
+ None,
+ Some("Air"),
+ ];
+
+ let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(true),
+ None,
+ Some(true)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+
+ assert_eq!(
+ nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(false),
+ None,
+ Some(false)
+ ]),
+ );
+ }
+}
diff --git a/arrow/src/compute/kernels/regexp.rs b/arrow-string/src/regexp.rs
similarity index 53%
rename from arrow/src/compute/kernels/regexp.rs
rename to arrow-string/src/regexp.rs
index 1c5fa1927..bb4b2b0a8 100644
--- a/arrow/src/compute/kernels/regexp.rs
+++ b/arrow-string/src/regexp.rs
@@ -18,22 +18,154 @@
//! Defines kernel to extract substrings based on a regular
//! expression of a \[Large\]StringArray
-use crate::array::{
- ArrayRef, GenericStringArray, GenericStringBuilder, ListBuilder, OffsetSizeTrait,
-};
-use crate::error::{ArrowError, Result};
+use arrow_array::builder::{BooleanBufferBuilder, GenericStringBuilder, ListBuilder};
+use arrow_array::*;
+use arrow_data::bit_mask::combine_option_bitmap;
+use arrow_data::ArrayData;
+use arrow_schema::{ArrowError, DataType};
+use regex::Regex;
use std::collections::HashMap;
-
use std::sync::Arc;
-use regex::Regex;
+/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
+/// If an element of `regex_array` is the empty string, the corresponding result value is always true.
+///
+/// `flags_array` is an optional [`StringArray`] / [`LargeStringArray`] of flags, which allow
+/// special search modes, such as case-insensitive and multi-line mode.
+/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
+/// for more information.
+pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex_array: &GenericStringArray<OffsetSize>,
+ flags_array: Option<&GenericStringArray<OffsetSize>>,
+) -> Result<BooleanArray, ArrowError> {
+ if array.len() != regex_array.len() {
+ return Err(ArrowError::ComputeError(
+ "Cannot perform comparison operation on arrays of different length"
+ .to_string(),
+ ));
+ }
+ let null_bit_buffer =
+ combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len());
+
+ let mut patterns: HashMap<String, Regex> = HashMap::new();
+ let mut result = BooleanBufferBuilder::new(array.len());
+
+ let complete_pattern = match flags_array {
+ Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
+ |(pattern, flags)| {
+ pattern.map(|pattern| match flags {
+ Some(flag) => format!("(?{}){}", flag, pattern),
+ None => pattern.to_string(),
+ })
+ },
+ )) as Box<dyn Iterator<Item = Option<String>>>,
+ None => Box::new(
+ regex_array
+ .iter()
+ .map(|pattern| pattern.map(|pattern| pattern.to_string())),
+ ),
+ };
+
+ array
+ .iter()
+ .zip(complete_pattern)
+ .map(|(value, pattern)| {
+ match (value, pattern) {
+ // Required for Postgres compatibility:
+ // `SELECT 'foobarbequebaz' ~ '';` evaluates to true
+ (Some(_), Some(pattern)) if pattern == *"" => {
+ result.append(true);
+ }
+ (Some(value), Some(pattern)) => {
+ let existing_pattern = patterns.get(&pattern);
+ let re = match existing_pattern {
+ Some(re) => re.clone(),
+ None => {
+ let re = Regex::new(pattern.as_str()).map_err(|e| {
+ ArrowError::ComputeError(format!(
+ "Regular expression did not compile: {:?}",
+ e
+ ))
+ })?;
+ patterns.insert(pattern, re.clone());
+ re
+ }
+ };
+ result.append(re.is_match(value));
+ }
+ _ => result.append(false),
+ }
+ Ok(())
+ })
+ .collect::<Result<Vec<()>, ArrowError>>()?;
+
+ let data = unsafe {
+ ArrayData::new_unchecked(
+ DataType::Boolean,
+ array.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![result.finish()],
+ vec![],
+ )
+ };
+ Ok(BooleanArray::from(data))
+}
+
+/// Perform SQL `array ~ regex` operation on [`StringArray`] /
+/// [`LargeStringArray`] and a scalar regex.
+///
+/// See the documentation on [`regexp_is_match_utf8`] for more details.
+pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ regex: &str,
+ flag: Option<&str>,
+) -> Result<BooleanArray, ArrowError> {
+ let null_bit_buffer = array.data().null_buffer().cloned();
+ let mut result = BooleanBufferBuilder::new(array.len());
+
+ let pattern = match flag {
+ Some(flag) => format!("(?{}){}", flag, regex),
+ None => regex.to_string(),
+ };
+ if pattern.is_empty() {
+ result.append_n(array.len(), true);
+ } else {
+ let re = Regex::new(pattern.as_str()).map_err(|e| {
+ ArrowError::ComputeError(format!(
+ "Regular expression did not compile: {:?}",
+ e
+ ))
+ })?;
+ for i in 0..array.len() {
+ let value = array.value(i);
+ result.append(re.is_match(value));
+ }
+ }
+
+ let buffer = result.finish();
+ let data = unsafe {
+ ArrayData::new_unchecked(
+ DataType::Boolean,
+ array.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![buffer],
+ vec![],
+ )
+ };
+ Ok(BooleanArray::from(data))
+}
/// Extract all groups matched by a regular expression for a given String array.
pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex_array: &GenericStringArray<OffsetSize>,
flags_array: Option<&GenericStringArray<OffsetSize>>,
-) -> Result<ArrayRef> {
+) -> Result<ArrayRef, ArrowError> {
let mut patterns: HashMap<String, Regex> = HashMap::new();
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
@@ -94,14 +226,14 @@ pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
}
Ok(())
})
- .collect::<Result<Vec<()>>>()?;
+ .collect::<Result<Vec<()>, ArrowError>>()?;
Ok(Arc::new(list_builder.finish()))
}
#[cfg(test)]
mod tests {
use super::*;
- use crate::array::{ListArray, StringArray};
+ use arrow_array::{ListArray, StringArray};
#[test]
fn match_single_group() {
@@ -117,7 +249,7 @@ mod tests {
let mut pattern_values = vec![r".*-(\d*)-.*"; 4];
pattern_values.push(r"(bar)(bequ1e)");
pattern_values.push("");
- let pattern = StringArray::from(pattern_values);
+ let pattern = GenericStringArray::<i32>::from(pattern_values);
let actual = regexp_match(&array, &pattern, None).unwrap();
let elem_builder: GenericStringBuilder<i32> = GenericStringBuilder::new();
let mut expected_builder = ListBuilder::new(elem_builder);
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow-string/src/substring.rs
similarity index 95%
rename from arrow/src/compute/kernels/substring.rs
rename to arrow-string/src/substring.rs
index 23cb2c19f..ece367553 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow-string/src/substring.rs
@@ -19,14 +19,12 @@
//! Supported array types:
//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
-use crate::array::DictionaryArray;
-use crate::buffer::MutableBuffer;
-use crate::datatypes::*;
-use crate::{array::*, buffer::Buffer};
-use crate::{
- datatypes::DataType,
- error::{ArrowError, Result},
-};
+use arrow_array::builder::BufferBuilder;
+use arrow_array::types::*;
+use arrow_array::*;
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
+use arrow_data::ArrayData;
+use arrow_schema::{ArrowError, DataType};
use std::cmp::Ordering;
use std::sync::Arc;
@@ -45,8 +43,8 @@ use std::sync::Arc;
///
/// # Basic usage
/// ```
-/// # use arrow::array::StringArray;
-/// # use arrow::compute::kernels::substring::substring;
+/// # use arrow_array::StringArray;
+/// # use arrow_string::substring::substring;
/// let array = StringArray::from(vec![Some("arrow"), None, Some("rust")]);
/// let result = substring(&array, 1, Some(4)).unwrap();
/// let result = result.as_any().downcast_ref::<StringArray>().unwrap();
@@ -61,13 +59,17 @@ use std::sync::Arc;
///
/// ## Example of trying to get an invalid utf-8 format substring
/// ```
-/// # use arrow::array::StringArray;
-/// # use arrow::compute::kernels::substring::substring;
+/// # use arrow_array::StringArray;
+/// # use arrow_string::substring::substring;
/// let array = StringArray::from(vec![Some("E=mc²")]);
/// let error = substring(&array, 0, Some(5)).unwrap_err().to_string();
/// assert!(error.contains("invalid utf-8 boundary"));
/// ```
-pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<ArrayRef> {
+pub fn substring(
+ array: &dyn Array,
+ start: i64,
+ length: Option<u64>,
+) -> Result<ArrayRef, ArrowError> {
macro_rules! substring_dict {
($kt: ident, $($t: ident: $gt: ident), *) => {
match $kt.as_ref() {
@@ -171,8 +173,8 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
///
/// # Basic usage
/// ```
-/// # use arrow::array::StringArray;
-/// # use arrow::compute::kernels::substring::substring_by_char;
+/// # use arrow_array::StringArray;
+/// # use arrow_string::substring::substring_by_char;
/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
@@ -181,7 +183,7 @@ pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: i64,
length: Option<u64>,
-) -> Result<GenericStringArray<OffsetSize>> {
+) -> Result<GenericStringArray<OffsetSize>, ArrowError> {
let mut vals = BufferBuilder::<u8>::new({
let offsets = array.value_offsets();
(offsets[array.len()] - offsets[0]).to_usize().unwrap()
@@ -251,7 +253,7 @@ fn binary_substring<OffsetSize: OffsetSizeTrait>(
array: &GenericBinaryArray<OffsetSize>,
start: OffsetSize,
length: Option<OffsetSize>,
-) -> Result<ArrayRef> {
+) -> Result<ArrayRef, ArrowError> {
let offsets = array.value_offsets();
let data = array.value_data();
let zero = OffsetSize::zero();
@@ -312,7 +314,7 @@ fn fixed_size_binary_substring(
old_len: i32,
start: i32,
length: Option<i32>,
-) -> Result<ArrayRef> {
+) -> Result<ArrayRef, ArrowError> {
let new_start = if start >= 0 {
start.min(old_len)
} else {
@@ -361,7 +363,7 @@ fn utf8_substring<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: OffsetSize,
length: Option<OffsetSize>,
-) -> Result<ArrayRef> {
+) -> Result<ArrayRef, ArrowError> {
let offsets = array.value_offsets();
let data = array.value_data();
let zero = OffsetSize::zero();
@@ -391,21 +393,23 @@ fn utf8_substring<OffsetSize: OffsetSizeTrait>(
let mut len_so_far = zero;
new_offsets.push(zero);
- offsets.windows(2).try_for_each(|pair| -> Result<()> {
- let new_start = match start.cmp(&zero) {
- Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
- Ordering::Equal => pair[0],
- Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
- };
- let new_end = match length {
- Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
- None => pair[1],
- };
- len_so_far += new_end - new_start;
- new_starts_ends.push((new_start, new_end));
- new_offsets.push(len_so_far);
- Ok(())
- })?;
+ offsets
+ .windows(2)
+ .try_for_each(|pair| -> Result<(), ArrowError> {
+ let new_start = match start.cmp(&zero) {
+ Ordering::Greater => check_char_boundary((pair[0] + start).min(pair[1]))?,
+ Ordering::Equal => pair[0],
+ Ordering::Less => check_char_boundary((pair[1] + start).max(pair[0]))?,
+ };
+ let new_end = match length {
+ Some(length) => check_char_boundary((length + new_start).min(pair[1]))?,
+ None => pair[1],
+ };
+ len_so_far += new_end - new_start;
+ new_starts_ends.push((new_start, new_end));
+ new_offsets.push(len_so_far);
+ Ok(())
+ })?;
// concatenate substrings into a buffer
let mut new_values = MutableBuffer::new(new_offsets.last().unwrap().as_usize());
@@ -439,7 +443,6 @@ fn utf8_substring<OffsetSize: OffsetSizeTrait>(
#[cfg(test)]
mod tests {
use super::*;
- use crate::datatypes::*;
/// A helper macro to generate test cases.
/// # Arguments
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 876d0d650..17f88c084 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -54,12 +54,12 @@ arrow-ipc = { version = "28.0.0", path = "../arrow-ipc", optional = true }
arrow-json = { version = "28.0.0", path = "../arrow-json", optional = true }
arrow-schema = { version = "28.0.0", path = "../arrow-schema" }
arrow-select = { version = "28.0.0", path = "../arrow-select" }
+arrow-string = { version = "28.0.0", path = "../arrow-string" }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
num = { version = "0.4", default-features = false, features = ["std"] }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.13", default-features = false }
regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] }
-regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] }
packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" }
chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
comfy-table = { version = "6.0", optional = true, default-features = false }
@@ -92,7 +92,7 @@ force_validate = ["arrow-data/force_validate"]
ffi = ["bitflags"]
# Enable dyn-comparison of dictionary arrays with other arrays
# Note: this does not impact comparison against scalars
-dyn_cmp_dict = []
+dyn_cmp_dict = ["arrow-string/dyn_cmp_dict"]
# Enable dyn-arithmetic kernels for dictionary arrays
# Note: this does not impact arithmetic with scalars
dyn_arith_dict = []
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index b672410fe..6976a68d9 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -23,1227 +23,75 @@
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.
//!
-use crate::array::*;
-use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer};
-use crate::datatypes::*;
-#[allow(unused_imports)]
-use crate::downcast_dictionary_array;
-use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
-use arrow_data::bit_mask::combine_option_bitmap;
-use arrow_select::take::take;
-use num::ToPrimitive;
-use regex::Regex;
-use std::collections::HashMap;
-
-/// Helper function to perform boolean lambda function on values from two array accessors, this
-/// version does not attempt to use SIMD.
-fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>(
- left: T,
- right: S,
- op: F,
-) -> Result<BooleanArray>
-where
- F: Fn(T::Item, S::Item) -> bool,
-{
- if left.len() != right.len() {
- return Err(ArrowError::ComputeError(
- "Cannot perform comparison operation on arrays of different length"
- .to_string(),
- ));
- }
-
- Ok(BooleanArray::from_binary(left, right, op))
-}
-
-/// Helper function to perform boolean lambda function on values from array accessor, this
-/// version does not attempt to use SIMD.
-fn compare_op_scalar<T: ArrayAccessor, F>(left: T, op: F) -> Result<BooleanArray>
-where
- F: Fn(T::Item) -> bool,
-{
- Ok(BooleanArray::from_unary(left, op))
-}
-
-/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified
-/// comparison function.
-pub fn no_simd_compare_op<T, F>(
- left: &PrimitiveArray<T>,
- right: &PrimitiveArray<T>,
- op: F,
-) -> Result<BooleanArray>
-where
- T: ArrowNumericType,
- F: Fn(T::Native, T::Native) -> bool,
-{
- compare_op(left, right, op)
-}
-
-/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using
-/// a specified comparison function.
-pub fn no_simd_compare_op_scalar<T, F>(
- left: &PrimitiveArray<T>,
- right: T::Native,
- op: F,
-) -> Result<BooleanArray>
-where
- T: ArrowNumericType,
- F: Fn(T::Native, T::Native) -> bool,
-{
- compare_op_scalar(left, |l| op(l, right))
-}
-
-fn is_like_pattern(c: char) -> bool {
- c == '%' || c == '_'
-}
-
-/// Evaluate regex `op(left)` matching `right` on [`StringArray`] / [`LargeStringArray`]
-///
-/// If `negate_regex` is true, the regex expression will be negated. (for example, with `not like`)
-fn regex_like<'a, S: ArrayAccessor<Item = &'a str>, F>(
- left: S,
- right: S,
- negate_regex: bool,
- op: F,
-) -> Result<BooleanArray>
-where
- F: Fn(&str) -> Result<Regex>,
-{
- let mut map = HashMap::new();
- if left.len() != right.len() {
- return Err(ArrowError::ComputeError(
- "Cannot perform comparison operation on arrays of different length"
- .to_string(),
- ));
- }
-
- let null_bit_buffer =
- combine_option_bitmap(&[left.data_ref(), right.data_ref()], left.len());
-
- let mut result = BooleanBufferBuilder::new(left.len());
- for i in 0..left.len() {
- let haystack = left.value(i);
- let pat = right.value(i);
- let re = if let Some(ref regex) = map.get(pat) {
- regex
- } else {
- let re_pattern = replace_like_wildcards(pat)?;
- let re = op(&re_pattern)?;
- map.insert(pat, re);
- map.get(pat).unwrap()
- };
-
- result.append(if negate_regex {
- !re.is_match(haystack)
- } else {
- re.is_match(haystack)
- });
- }
-
- let data = unsafe {
- ArrayData::new_unchecked(
- DataType::Boolean,
- left.len(),
- None,
- null_bit_buffer,
- 0,
- vec![result.finish()],
- vec![],
- )
- };
- Ok(BooleanArray::from(data))
-}
-
-/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`].
-///
-/// There are two wildcards supported with the LIKE operator:
-///
-/// 1. `%` - The percent sign represents zero, one, or multiple characters
-/// 2. `_` - The underscore represents a single character
-///
-/// For example:
-/// ```
-/// use arrow::array::{StringArray, BooleanArray};
-/// use arrow::compute::like_utf8;
-///
-/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]);
-/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A_"]);
-///
-/// let result = like_utf8(&strings, &patterns).unwrap();
-/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true]));
-/// ```
-pub fn like_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
-}
-
-/// Perform SQL `left LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn like_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
- match (left.data_type(), right.data_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = as_string_array(left);
- let right = as_string_array(right);
- like_utf8(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = as_largestring_array(left);
- let right = as_largestring_array(right);
- like_utf8(left, right)
- }
- #[cfg(feature = "dyn_cmp_dict")]
- (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
- downcast_dictionary_array!(
- left => {
- let right = as_dictionary_array(right);
- like_dict(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "like_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left LIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-#[cfg(feature = "dyn_cmp_dict")]
-fn like_dict<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &DictionaryArray<K>,
-) -> Result<BooleanArray> {
- match (left.value_type(), right.value_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
-
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
-
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
- }
- _ => Err(ArrowError::ComputeError(
- "like_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
- .to_string(),
- )),
- }
-}
-
-#[inline]
-fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
- op: F,
-) -> Result<BooleanArray> {
- if !right.contains(is_like_pattern) {
- // fast path, can use equals
- compare_op_scalar(left, |item| op(item == right))
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let starts_with = &right[..right.len() - 1];
-
- compare_op_scalar(left, |item| op(item.starts_with(starts_with)))
- } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_with = &right[1..];
-
- compare_op_scalar(left, |item| op(item.ends_with(ends_with)))
- } else if right.starts_with('%')
- && right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[1..right.len() - 1].contains(is_like_pattern)
- {
- let contains = &right[1..right.len() - 1];
-
- compare_op_scalar(left, |item| op(item.contains(contains)))
- } else {
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })?;
-
- compare_op_scalar(left, |item| op(re.is_match(item)))
- }
-}
-
-#[inline]
-fn like_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
-) -> Result<BooleanArray> {
- like_scalar_op(left, right, |x| x)
-}
-
-/// Perform SQL `left LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn like_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result<BooleanArray> {
- match left.data_type() {
- DataType::Utf8 => {
- let left = as_string_array(left);
- like_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = as_largestring_array(left);
- like_scalar(left, right)
- }
- DataType::Dictionary(_, _) => {
- downcast_dictionary_array!(
- left => {
- like_dict_scalar(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "like_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn like_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray> {
- like_scalar(left, right)
-}
-
-/// Perform SQL `left LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn like_dict_scalar<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- like_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- like_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "like_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does:
-///
-/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.`
-/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.`
-/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%`
-fn replace_like_wildcards(pattern: &str) -> Result<String> {
- let mut result = String::new();
- let pattern = String::from(pattern);
- let mut chars_iter = pattern.chars().peekable();
- while let Some(c) = chars_iter.next() {
- if c == '\\' {
- let next = chars_iter.peek();
- match next {
- Some(next) if is_like_pattern(*next) => {
- result.push(*next);
- // Skipping the next char as it is already appended
- chars_iter.next();
- }
- _ => {
- result.push('\\');
- result.push('\\');
- }
- }
- } else if regex_syntax::is_meta_character(c) {
- result.push('\\');
- result.push(c);
- } else if c == '%' {
- result.push_str(".*");
- } else if c == '_' {
- result.push('.');
- } else {
- result.push(c);
- }
- }
- Ok(result)
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
-}
-
-/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
- match (left.data_type(), right.data_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = as_string_array(left);
- let right = as_string_array(right);
- nlike_utf8(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = as_largestring_array(left);
- let right = as_largestring_array(right);
- nlike_utf8(left, right)
- }
- #[cfg(feature = "dyn_cmp_dict")]
- (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
- downcast_dictionary_array!(
- left => {
- let right = as_dictionary_array(right);
- nlike_dict(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nlike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left NOT LIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-#[cfg(feature = "dyn_cmp_dict")]
-fn nlike_dict<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &DictionaryArray<K>,
-) -> Result<BooleanArray> {
- match (left.value_type(), right.value_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
-
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
-
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from LIKE pattern: {}",
- e
- ))
- })
- })
- }
- _ => Err(ArrowError::ComputeError(
- "nlike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
- .to_string(),
- )),
- }
-}
-
-#[inline]
-fn nlike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
-) -> Result<BooleanArray> {
- like_scalar_op(left, right, |x| !x)
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result<BooleanArray> {
- match left.data_type() {
- DataType::Utf8 => {
- let left = as_string_array(left);
- nlike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = as_largestring_array(left);
- nlike_scalar(left, right)
- }
- DataType::Dictionary(_, _) => {
- downcast_dictionary_array!(
- left => {
- nlike_dict_scalar(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nlike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray> {
- nlike_scalar(left, right)
-}
-
-/// Perform SQL `left NOT LIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nlike_dict_scalar<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- nlike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- nlike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nlike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn ilike_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
-}
-
-/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn ilike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
- match (left.data_type(), right.data_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = as_string_array(left);
- let right = as_string_array(right);
- ilike_utf8(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = as_largestring_array(left);
- let right = as_largestring_array(right);
- ilike_utf8(left, right)
- }
- #[cfg(feature = "dyn_cmp_dict")]
- (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
- downcast_dictionary_array!(
- left => {
- let right = as_dictionary_array(right);
- ilike_dict(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "ilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left ILIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-#[cfg(feature = "dyn_cmp_dict")]
-fn ilike_dict<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &DictionaryArray<K>,
-) -> Result<BooleanArray> {
- match (left.value_type(), right.value_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
-
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
-
- regex_like(left, right, false, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
- }
- _ => Err(ArrowError::ComputeError(
- "ilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
- .to_string(),
- )),
- }
-}
-
-#[inline]
-fn ilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
-) -> Result<BooleanArray> {
- let null_bit_buffer = left.data().null_buffer().cloned();
- let bytes = bit_util::ceil(left.len(), 8);
- let mut bool_buf = MutableBuffer::from_len_zeroed(bytes);
- let bool_slice = bool_buf.as_slice_mut();
-
- if !right.contains(is_like_pattern) {
- // fast path, can use equals
- let right_uppercase = right.to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if left.value_unchecked(i).to_uppercase() == right_uppercase {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let start_str = &right[..right.len() - 1].to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if left
- .value_unchecked(i)
- .to_uppercase()
- .starts_with(start_str)
- {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_str = &right[1..].to_uppercase();
-
- for i in 0..left.len() {
- unsafe {
- if left.value_unchecked(i).to_uppercase().ends_with(ends_str) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.starts_with('%')
- && right.ends_with('%')
- && !right[1..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use contains
- let contains = &right[1..right.len() - 1].to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if left.value_unchecked(i).to_uppercase().contains(contains) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else {
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })?;
-
- for i in 0..left.len() {
- let haystack = unsafe { left.value_unchecked(i) };
- if re.is_match(haystack) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- };
-
- let data = unsafe {
- ArrayData::new_unchecked(
- DataType::Boolean,
- left.len(),
- None,
- null_bit_buffer,
- 0,
- vec![bool_buf.into()],
- vec![],
- )
- };
- Ok(BooleanArray::from(data))
-}
-
-/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result<BooleanArray> {
- match left.data_type() {
- DataType::Utf8 => {
- let left = as_string_array(left);
- ilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = as_largestring_array(left);
- ilike_scalar(left, right)
- }
- DataType::Dictionary(_, _) => {
- downcast_dictionary_array!(
- left => {
- ilike_dict_scalar(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn ilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray> {
- ilike_scalar(left, right)
-}
-
-/// Perform SQL `left ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn ilike_dict_scalar<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- ilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- ilike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "ilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nilike_utf8<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &GenericStringArray<OffsetSize>,
-) -> Result<BooleanArray> {
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nilike_dyn(left: &dyn Array, right: &dyn Array) -> Result<BooleanArray> {
- match (left.data_type(), right.data_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = as_string_array(left);
- let right = as_string_array(right);
- nilike_utf8(left, right)
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = as_largestring_array(left);
- let right = as_largestring_array(right);
- nilike_utf8(left, right)
- }
- #[cfg(feature = "dyn_cmp_dict")]
- (DataType::Dictionary(_, _), DataType::Dictionary(_, _)) => {
- downcast_dictionary_array!(
- left => {
- let right = as_dictionary_array(right);
- nilike_dict(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nilike_dyn only supports Utf8, LargeUtf8 or DictionaryArray (with feature `dyn_cmp_dict`) with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`].
-///
-/// See the documentation on [`like_utf8`] for more details.
-#[cfg(feature = "dyn_cmp_dict")]
-fn nilike_dict<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &DictionaryArray<K>,
-) -> Result<BooleanArray> {
- match (left.value_type(), right.value_type()) {
- (DataType::Utf8, DataType::Utf8) => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i32>>().unwrap();
-
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
- }
- (DataType::LargeUtf8, DataType::LargeUtf8) => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- let right = right.downcast_dict::<GenericStringArray<i64>>().unwrap();
-
- regex_like(left, right, true, |re_pattern| {
- Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })
- })
- }
- _ => Err(ArrowError::ComputeError(
- "nilike_dict only supports DictionaryArray with Utf8 or LargeUtf8 values"
- .to_string(),
- )),
- }
-}
-
-#[inline]
-fn nilike_scalar<'a, L: ArrayAccessor<Item = &'a str>>(
- left: L,
- right: &str,
-) -> Result<BooleanArray> {
- let null_bit_buffer = left.data().null_buffer().cloned();
- let bytes = bit_util::ceil(left.len(), 8);
- let mut bool_buf = MutableBuffer::from_len_zeroed(bytes);
- let bool_slice = bool_buf.as_slice_mut();
-
- if !right.contains(is_like_pattern) {
- // fast path, can use equals
- let right_uppercase = right.to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if left.value_unchecked(i).to_uppercase() != right_uppercase {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.ends_with('%')
- && !right.ends_with("\\%")
- && !right[..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use starts_with
- let start_str = &right[..right.len() - 1].to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if !(left
- .value_unchecked(i)
- .to_uppercase()
- .starts_with(start_str))
- {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) {
- // fast path, can use ends_with
- let ends_str = &right[1..].to_uppercase();
-
- for i in 0..left.len() {
- unsafe {
- if !(left.value_unchecked(i).to_uppercase().ends_with(ends_str)) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else if right.starts_with('%')
- && right.ends_with('%')
- && !right[1..right.len() - 1].contains(is_like_pattern)
- {
- // fast path, can use contains
- let contains = &right[1..right.len() - 1].to_uppercase();
- for i in 0..left.len() {
- unsafe {
- if !(left.value_unchecked(i).to_uppercase().contains(contains)) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- }
- } else {
- let re_pattern = replace_like_wildcards(right)?;
- let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Unable to build regex from ILIKE pattern: {}",
- e
- ))
- })?;
-
- for i in 0..left.len() {
- let haystack = unsafe { left.value_unchecked(i) };
- if !re.is_match(haystack) {
- bit_util::set_bit(bool_slice, i);
- }
- }
- };
-
- let data = unsafe {
- ArrayData::new_unchecked(
- DataType::Boolean,
- left.len(),
- None,
- null_bit_buffer,
- 0,
- vec![bool_buf.into()],
- vec![],
- )
- };
- Ok(BooleanArray::from(data))
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`], or [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result<BooleanArray> {
- match left.data_type() {
- DataType::Utf8 => {
- let left = as_string_array(left);
- nilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = as_largestring_array(left);
- nilike_scalar(left, right)
- }
- DataType::Dictionary(_, _) => {
- downcast_dictionary_array!(
- left => {
- nilike_dict_scalar(left, right)
- }
- t => Err(ArrowError::ComputeError(format!(
- "Should be DictionaryArray but got: {}", t
- )))
- )
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-pub fn nilike_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- left: &GenericStringArray<OffsetSize>,
- right: &str,
-) -> Result<BooleanArray> {
- nilike_scalar(left, right)
-}
-
-/// Perform SQL `left NOT ILIKE right` operation on [`DictionaryArray`] with values
-/// [`StringArray`]/[`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`like_utf8`] for more details.
-fn nilike_dict_scalar<K: ArrowNumericType>(
- left: &DictionaryArray<K>,
- right: &str,
-) -> Result<BooleanArray> {
- match left.value_type() {
- DataType::Utf8 => {
- let left = left.downcast_dict::<GenericStringArray<i32>>().unwrap();
- nilike_scalar(left, right)
- }
- DataType::LargeUtf8 => {
- let left = left.downcast_dict::<GenericStringArray<i64>>().unwrap();
- nilike_scalar(left, right)
- }
- _ => {
- Err(ArrowError::ComputeError(
- "nilike_dict_scalar only supports DictionaryArray with Utf8 or LargeUtf8 values".to_string(),
- ))
- }
- }
-}
+pub use arrow_string::like::*;
+pub use arrow_string::regexp::{regexp_is_match_utf8, regexp_is_match_utf8_scalar};
-/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
-/// If `regex_array` element has an empty value, the corresponding result value is always true.
-///
-/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow
-/// special search modes, such as case insensitive and multi-line mode.
-/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
-/// for more information.
-pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
- array: &GenericStringArray<OffsetSize>,
- regex_array: &GenericStringArray<OffsetSize>,
- flags_array: Option<&GenericStringArray<OffsetSize>>,
-) -> Result<BooleanArray> {
- if array.len() != regex_array.len() {
+use crate::array::*;
+use crate::buffer::{buffer_unary_not, Buffer, MutableBuffer};
+use crate::datatypes::*;
+#[allow(unused_imports)]
+use crate::downcast_dictionary_array;
+use crate::error::{ArrowError, Result};
+use crate::util::bit_util;
+use arrow_data::bit_mask::combine_option_bitmap;
+use arrow_select::take::take;
+use num::ToPrimitive;
+
+/// Helper function to perform boolean lambda function on values from two array accessors, this
+/// version does not attempt to use SIMD.
+fn compare_op<T: ArrayAccessor, S: ArrayAccessor, F>(
+ left: T,
+ right: S,
+ op: F,
+) -> Result<BooleanArray>
+where
+ F: Fn(T::Item, S::Item) -> bool,
+{
+ if left.len() != right.len() {
return Err(ArrowError::ComputeError(
"Cannot perform comparison operation on arrays of different length"
.to_string(),
));
}
- let null_bit_buffer =
- combine_option_bitmap(&[array.data_ref(), regex_array.data_ref()], array.len());
-
- let mut patterns: HashMap<String, Regex> = HashMap::new();
- let mut result = BooleanBufferBuilder::new(array.len());
-
- let complete_pattern = match flags_array {
- Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
- |(pattern, flags)| {
- pattern.map(|pattern| match flags {
- Some(flag) => format!("(?{}){}", flag, pattern),
- None => pattern.to_string(),
- })
- },
- )) as Box<dyn Iterator<Item = Option<String>>>,
- None => Box::new(
- regex_array
- .iter()
- .map(|pattern| pattern.map(|pattern| pattern.to_string())),
- ),
- };
-
- array
- .iter()
- .zip(complete_pattern)
- .map(|(value, pattern)| {
- match (value, pattern) {
- // Required for Postgres compatibility:
- // SELECT 'foobarbequebaz' ~ ''); = true
- (Some(_), Some(pattern)) if pattern == *"" => {
- result.append(true);
- }
- (Some(value), Some(pattern)) => {
- let existing_pattern = patterns.get(&pattern);
- let re = match existing_pattern {
- Some(re) => re.clone(),
- None => {
- let re = Regex::new(pattern.as_str()).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Regular expression did not compile: {:?}",
- e
- ))
- })?;
- patterns.insert(pattern, re.clone());
- re
- }
- };
- result.append(re.is_match(value));
- }
- _ => result.append(false),
- }
- Ok(())
- })
- .collect::<Result<Vec<()>>>()?;
- let data = unsafe {
- ArrayData::new_unchecked(
- DataType::Boolean,
- array.len(),
- None,
- null_bit_buffer,
- 0,
- vec![result.finish()],
- vec![],
- )
- };
- Ok(BooleanArray::from(data))
+ Ok(BooleanArray::from_binary(left, right, op))
}
-/// Perform SQL `array ~ regex_array` operation on [`StringArray`] /
-/// [`LargeStringArray`] and a scalar.
-///
-/// See the documentation on [`regexp_is_match_utf8`] for more details.
-pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
- array: &GenericStringArray<OffsetSize>,
- regex: &str,
- flag: Option<&str>,
-) -> Result<BooleanArray> {
- let null_bit_buffer = array.data().null_buffer().cloned();
- let mut result = BooleanBufferBuilder::new(array.len());
+/// Helper function to perform boolean lambda function on values from array accessor, this
+/// version does not attempt to use SIMD.
+fn compare_op_scalar<T: ArrayAccessor, F>(left: T, op: F) -> Result<BooleanArray>
+where
+ F: Fn(T::Item) -> bool,
+{
+ Ok(BooleanArray::from_unary(left, op))
+}
- let pattern = match flag {
- Some(flag) => format!("(?{}){}", flag, regex),
- None => regex.to_string(),
- };
- if pattern.is_empty() {
- result.append_n(array.len(), true);
- } else {
- let re = Regex::new(pattern.as_str()).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Regular expression did not compile: {:?}",
- e
- ))
- })?;
- for i in 0..array.len() {
- let value = array.value(i);
- result.append(re.is_match(value));
- }
- }
+/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified
+/// comparison function.
+pub fn no_simd_compare_op<T, F>(
+ left: &PrimitiveArray<T>,
+ right: &PrimitiveArray<T>,
+ op: F,
+) -> Result<BooleanArray>
+where
+ T: ArrowNumericType,
+ F: Fn(T::Native, T::Native) -> bool,
+{
+ compare_op(left, right, op)
+}
- let buffer = result.finish();
- let data = unsafe {
- ArrayData::new_unchecked(
- DataType::Boolean,
- array.len(),
- None,
- null_bit_buffer,
- 0,
- vec![buffer],
- vec![],
- )
- };
- Ok(BooleanArray::from(data))
+/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using
+/// a specified comparison function.
+pub fn no_simd_compare_op_scalar<T, F>(
+ left: &PrimitiveArray<T>,
+ right: T::Native,
+ op: F,
+) -> Result<BooleanArray>
+where
+ T: ArrowNumericType,
+ F: Fn(T::Native, T::Native) -> bool,
+{
+ compare_op_scalar(left, |l| op(l, right))
}
/// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`].
@@ -4727,24 +3575,6 @@ mod tests {
};
}
- macro_rules! test_dict_utf8 {
- ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
- #[test]
- #[cfg(feature = "dyn_cmp_dict")]
- fn $test_name() {
- let left: DictionaryArray<Int8Type> = $left.into_iter().collect();
- let right: DictionaryArray<Int8Type> = $right.into_iter().collect();
- let res = $op(&left, &right).unwrap();
- let expected = $expected;
- assert_eq!(expected.len(), res.len());
- for i in 0..res.len() {
- let v = res.value(i);
- assert_eq!(v, expected[i]);
- }
- }
- };
- }
-
#[test]
fn test_utf8_eq_scalar_on_slice() {
let a = StringArray::from(
@@ -4879,432 +3709,25 @@ mod tests {
left.value(i),
i,
$right
- );
- }
- }
- };
- }
-
- test_utf8!(
- test_utf8_array_like,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
- like_utf8,
- vec![true, true, true, false, false, true, false, false]
- );
-
- test_dict_utf8!(
- test_utf8_array_like_dict,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"],
- like_dyn,
- vec![true, true, true, false, false, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_escape_testing,
- test_utf8_array_like_scalar_dyn_escape_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
- "%(%)%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_escape_regex,
- test_utf8_array_like_scalar_dyn_escape_regex,
- vec![".*", "a", "*"],
- ".*",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_escape_regex_dot,
- test_utf8_array_like_scalar_dyn_escape_regex_dot,
- vec![".", "a", "*"],
- ".",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar,
- test_utf8_array_like_scalar_dyn,
- vec!["arrow", "parquet", "datafusion", "flight"],
- "%ar%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_start,
- test_utf8_array_like_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "arr"],
- "arrow%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, true, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_end,
- test_utf8_array_like_scalar_dyn_end,
- vec!["arrow", "parrow", "arrows", "arr"],
- "%arrow",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_equals,
- test_utf8_array_like_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
- "arrow",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_like_scalar_one,
- test_utf8_array_like_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
- "arrow_",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![false, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_scalar_like_escape,
- test_utf8_scalar_like_dyn_escape,
- vec!["a%", "a\\x"],
- "a\\%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false]
- );
-
- test_utf8_scalar!(
- test_utf8_scalar_like_escape_contains,
- test_utf8_scalar_like_dyn_escape_contains,
- vec!["ba%", "ba\\x"],
- "%a\\%",
- like_utf8_scalar,
- like_utf8_scalar_dyn,
- vec![true, false]
- );
-
- test_utf8!(
- test_utf8_scalar_ilike_regex,
- vec!["%%%"],
- vec![r#"\%_\%"#],
- ilike_utf8,
- vec![true]
- );
-
- test_dict_utf8!(
- test_utf8_scalar_ilike_regex_dict,
- vec!["%%%"],
- vec![r#"\%_\%"#],
- ilike_dyn,
- vec![true]
- );
-
- #[test]
- fn test_replace_like_wildcards() {
- let a_eq = "_%";
- let expected = "..*";
- assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_leave_like_meta_chars() {
- let a_eq = "\\%\\_";
- let expected = "%_";
- assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_with_multiple_escape_chars() {
- let a_eq = "\\\\%";
- let expected = "\\\\%";
- assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
- }
-
- #[test]
- fn test_replace_like_wildcards_escape_regex_meta_char() {
- let a_eq = ".";
- let expected = "\\.";
- assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected);
- }
-
- test_utf8!(
- test_utf8_array_eq,
- vec!["arrow", "arrow", "arrow", "arrow"],
- vec!["arrow", "parquet", "datafusion", "flight"],
- eq_utf8,
- vec![true, false, false, false]
- );
- test_utf8_scalar!(
- test_utf8_array_eq_scalar,
- vec!["arrow", "parquet", "datafusion", "flight"],
- "arrow",
- eq_utf8_scalar,
- vec![true, false, false, false]
- );
-
- test_utf8!(
- test_utf8_array_nlike,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
- nlike_utf8,
- vec![false, false, false, true, true, false, true]
- );
-
- test_dict_utf8!(
- test_utf8_array_nlike_dict,
- vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"],
- vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"],
- nlike_dyn,
- vec![false, false, false, true, true, false, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_escape_testing,
- test_utf8_array_nlike_escape_dyn_testing_dyn,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
- "%(%)%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_escape_regex,
- test_utf8_array_nlike_scalar_dyn_escape_regex,
- vec![".*", "a", "*"],
- ".*",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_escape_regex_dot,
- test_utf8_array_nlike_scalar_dyn_escape_regex_dot,
- vec![".", "a", "*"],
- ".",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, true]
- );
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar,
- test_utf8_array_nlike_scalar_dyn,
- vec!["arrow", "parquet", "datafusion", "flight"],
- "%ar%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_start,
- test_utf8_array_nlike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "arr"],
- "arrow%",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, false, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_end,
- test_utf8_array_nlike_scalar_dyn_end,
- vec!["arrow", "parrow", "arrows", "arr"],
- "%arrow",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_equals,
- test_utf8_array_nlike_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
- "arrow",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![false, true, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nlike_scalar_one,
- test_utf8_array_nlike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
- "arrow_",
- nlike_utf8_scalar,
- nlike_utf8_scalar_dyn,
- vec![true, false, true, true]
- );
-
- test_utf8!(
- test_utf8_array_ilike,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- ilike_utf8,
- vec![true, true, true, false, false, true, false]
- );
-
- test_dict_utf8!(
- test_utf8_array_ilike_dict,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- ilike_dyn,
- vec![true, true, true, false, false, true, false]
- );
-
- test_utf8_scalar!(
- ilike_utf8_scalar_escape_testing,
- ilike_utf8_scalar_escape_dyn_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
- "%(%)%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_ilike_scalar,
- test_utf8_array_ilike_dyn_scalar,
- vec!["arrow", "parquet", "datafusion", "flight"],
- "%AR%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_ilike_scalar_start,
- test_utf8_array_ilike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "ARR"],
- "aRRow%",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, false, true, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_ilike_scalar_end,
- test_utf8_array_ilike_scalar_dyn_end,
- vec!["ArroW", "parrow", "ARRowS", "arr"],
- "%arrow",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, true, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_ilike_scalar_equals,
- test_utf8_array_ilike_scalar_dyn_equals,
- vec!["arrow", "parrow", "arrows", "arr"],
- "Arrow",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![true, false, false, false]
- );
-
- test_utf8_scalar!(
- test_utf8_array_ilike_scalar_one,
- test_utf8_array_ilike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
- "arrow_",
- ilike_utf8_scalar,
- ilike_utf8_scalar_dyn,
- vec![false, true, false, false]
- );
+ );
+ }
+ }
+ };
+ }
test_utf8!(
- test_utf8_array_nilike,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- nilike_utf8,
- vec![false, false, false, true, true, false, true]
- );
-
- test_dict_utf8!(
- test_utf8_array_nilike_dict,
- vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"],
- vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"],
- nilike_dyn,
- vec![false, false, false, true, true, false, true]
- );
-
- test_utf8_scalar!(
- nilike_utf8_scalar_escape_testing,
- nilike_utf8_scalar_escape_dyn_testing,
- vec!["varchar(255)", "int(255)", "varchar", "int"],
- "%(%)%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nilike_scalar,
- test_utf8_array_nilike_dyn_scalar,
+ test_utf8_array_eq,
+ vec!["arrow", "arrow", "arrow", "arrow"],
vec!["arrow", "parquet", "datafusion", "flight"],
- "%AR%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nilike_scalar_start,
- test_utf8_array_nilike_scalar_dyn_start,
- vec!["arrow", "parrow", "arrows", "ARR"],
- "aRRow%",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, true, false, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nilike_scalar_end,
- test_utf8_array_nilike_scalar_dyn_end,
- vec!["ArroW", "parrow", "ARRowS", "arr"],
- "%arrow",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, false, true, true]
- );
-
- test_utf8_scalar!(
- test_utf8_array_nilike_scalar_equals,
- test_utf8_array_nilike_scalar_dyn_equals,
- vec!["arRow", "parrow", "arrows", "arr"],
- "Arrow",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![false, true, true, true]
+ eq_utf8,
+ vec![true, false, false, false]
);
-
test_utf8_scalar!(
- test_utf8_array_nilike_scalar_one,
- test_utf8_array_nilike_scalar_dyn_one,
- vec!["arrow", "arrows", "parrow", "arr"],
- "arrow_",
- nilike_utf8_scalar,
- nilike_utf8_scalar_dyn,
- vec![true, false, true, true]
+ test_utf8_array_eq_scalar,
+ vec!["arrow", "parquet", "datafusion", "flight"],
+ "arrow",
+ eq_utf8_scalar,
+ vec![true, false, false, false]
);
test_utf8!(
@@ -6667,86 +5090,6 @@ mod tests {
assert_eq!(gt_eq_dyn_scalar(&array, f64::NAN).unwrap(), expected);
}
- #[test]
- fn test_dict_like_kernels() {
- let data =
- vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")];
-
- let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
-
- let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef;
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- like_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
- }
-
#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dictionary_to_utf8_array() {
@@ -6959,246 +5302,6 @@ mod tests {
);
}
- #[test]
- fn test_dict_nlike_kernels() {
- let data =
- vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")];
-
- let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
-
- let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef;
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_array, "Air").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_array, "Wa%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_array, "%r").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_array, "%i%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_array, "%a%r%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- nlike_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
- }
-
- #[test]
- fn test_dict_ilike_kernels() {
- let data =
- vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")];
-
- let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
-
- let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef;
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- ilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)]
- ),
- );
- }
-
- #[test]
- fn test_dict_nilike_kernels() {
- let data =
- vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")];
-
- let dict_array: DictionaryArray<Int8Type> = data.into_iter().collect();
-
- let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef;
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_array, "air").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_array, "wa%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_array, "%R").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_array, "%I%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(),
- BooleanArray::from(
- vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_array, "%A%r%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
-
- assert_eq!(
- nilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(),
- BooleanArray::from(
- vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)]
- ),
- );
- }
-
#[test]
#[cfg(feature = "dyn_cmp_dict")]
fn test_eq_dyn_neq_dyn_dict_non_dict_float_nan() {
diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs
index 0eebb7012..29468861f 100644
--- a/arrow/src/compute/kernels/mod.rs
+++ b/arrow/src/compute/kernels/mod.rs
@@ -23,15 +23,12 @@ pub mod arity;
pub mod bitwise;
pub mod boolean;
pub mod comparison;
-pub mod concat_elements;
-pub mod length;
pub mod limit;
pub mod partition;
-pub mod regexp;
pub mod sort;
-pub mod substring;
pub mod temporal;
pub use arrow_cast::cast;
pub use arrow_cast::parse as cast_utils;
pub use arrow_select::{concat, filter, interleave, take, window, zip};
+pub use arrow_string::{concat_elements, length, regexp, substring};
diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs
index 1b2ff0684..a27e6b9af 100644
--- a/arrow/src/lib.rs
+++ b/arrow/src/lib.rs
@@ -39,6 +39,7 @@
//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format
//! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays
//! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays
+//! * [`arrow-string`][arrow_string] - string kernels for arrow arrays
//!
//! _This list is likely to grow as further functionality is split out from the top-level crate_
//!
diff --git a/dev/release/README.md b/dev/release/README.md
index a3d1a8c31..0e35f80aa 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -258,6 +258,7 @@ Rust Arrow Crates:
(cd arrow-array && cargo publish)
(cd arrow-select && cargo publish)
(cd arrow-cast && cargo publish)
+(cd arrow-string && cargo publish)
(cd arrow-ipc && cargo publish)
(cd arrow-csv && cargo publish)
(cd arrow-json && cargo publish)