You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/03 10:56:15 UTC
(arrow-datafusion) branch main updated: Cleanup regex_expressions.rs to remove _regexp_match function (#9107)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 9d1502bce0 Cleanup regex_expressions.rs to remove _regexp_match function (#9107)
9d1502bce0 is described below
commit 9d1502bce08172a3501524fb3ef4754af3cf98b6
Author: Bruce Ritchie <br...@veeva.com>
AuthorDate: Sat Feb 3 05:56:10 2024 -0500
Cleanup regex_expressions.rs to remove _regexp_match function (#9107)
* Cleanup regex_expressions.rs to remove _regexp_match function #9106
* Adding datafusion-cli Cargo.lock
* Update datafusion/physical-expr/src/regex_expressions.rs
Removed extraneous line as per code review suggestion
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
* Update datafusion/physical-expr/src/regex_expressions.rs
Removed extraneous lines as per code review
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
* Make rustfmt happy again.
---------
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
Cargo.toml | 1 +
datafusion-cli/Cargo.lock | 1 +
datafusion/physical-expr/Cargo.toml | 1 +
datafusion/physical-expr/src/regex_expressions.rs | 97 +++--------------------
4 files changed, 13 insertions(+), 87 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 4c0e7bde26..e99618dd9f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] }
arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] }
arrow-ord = { version = "50.0.0", default-features = false }
arrow-schema = { version = "50.0.0", default-features = false }
+arrow-string = { version = "50.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "0.4.1"
bytes = "1.4"
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 4d5b3b711d..94c57bd770 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1264,6 +1264,7 @@ dependencies = [
"arrow-buffer",
"arrow-ord",
"arrow-schema",
+ "arrow-string",
"base64",
"blake2",
"blake3",
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index dc3ecdb14f..029f4565c0 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -49,6 +49,7 @@ arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-ord = { workspace = true }
arrow-schema = { workspace = true }
+arrow-string = { workspace = true }
base64 = { version = "0.21", optional = true }
blake2 = { version = "^0.10.2", optional = true }
blake3 = { version = "1.0", optional = true }
diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs
index bdd272563e..83733da864 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -25,8 +25,7 @@ use arrow::array::{
new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray,
OffsetSizeTrait,
};
-use arrow_array::builder::{GenericStringBuilder, ListBuilder};
-use arrow_schema::ArrowError;
+
use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
@@ -61,19 +60,20 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
- _regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e))
+ arrow_string::regexp::regexp_match(values, regex, None)
+ .map_err(|e| arrow_datafusion_err!(e))
}
3 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
- let flags = Some(as_generic_string_array::<T>(&args[2])?);
+ let flags = as_generic_string_array::<T>(&args[2])?;
- match flags {
- Some(f) if f.iter().any(|s| s == Some("g")) => {
- plan_err!("regexp_match() does not support the \"global\" option")
- },
- _ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)),
+ if flags.iter().any(|s| s == Some("g")) {
+ return plan_err!("regexp_match() does not support the \"global\" option")
}
+
+ arrow_string::regexp::regexp_match(values, regex, Some(flags))
+ .map_err(|e| arrow_datafusion_err!(e))
}
other => internal_err!(
"regexp_match was called with {other} arguments. It requires at least 2 and at most 3."
@@ -81,83 +81,6 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}
-/// TODO: Remove this once it is included in arrow-rs new release.
-/// <https://github.com/apache/arrow-rs/pull/5235>
-fn _regexp_match<OffsetSize: OffsetSizeTrait>(
- array: &GenericStringArray<OffsetSize>,
- regex_array: &GenericStringArray<OffsetSize>,
- flags_array: Option<&GenericStringArray<OffsetSize>>,
-) -> std::result::Result<ArrayRef, ArrowError> {
- let mut patterns: std::collections::HashMap<String, Regex> =
- std::collections::HashMap::new();
- let builder: GenericStringBuilder<OffsetSize> =
- GenericStringBuilder::with_capacity(0, 0);
- let mut list_builder = ListBuilder::new(builder);
-
- let complete_pattern = match flags_array {
- Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
- |(pattern, flags)| {
- pattern.map(|pattern| match flags {
- Some(value) => format!("(?{value}){pattern}"),
- None => pattern.to_string(),
- })
- },
- )) as Box<dyn Iterator<Item = Option<String>>>,
- None => Box::new(
- regex_array
- .iter()
- .map(|pattern| pattern.map(|pattern| pattern.to_string())),
- ),
- };
-
- array
- .iter()
- .zip(complete_pattern)
- .map(|(value, pattern)| {
- match (value, pattern) {
- // Required for Postgres compatibility:
- // SELECT regexp_match('foobarbequebaz', ''); = {""}
- (Some(_), Some(pattern)) if pattern == *"" => {
- list_builder.values().append_value("");
- list_builder.append(true);
- }
- (Some(value), Some(pattern)) => {
- let existing_pattern = patterns.get(&pattern);
- let re = match existing_pattern {
- Some(re) => re,
- None => {
- let re = Regex::new(pattern.as_str()).map_err(|e| {
- ArrowError::ComputeError(format!(
- "Regular expression did not compile: {e:?}"
- ))
- })?;
- patterns.insert(pattern.clone(), re);
- patterns.get(&pattern).unwrap()
- }
- };
- match re.captures(value) {
- Some(caps) => {
- let mut iter = caps.iter();
- if caps.len() > 1 {
- iter.next();
- }
- for m in iter.flatten() {
- list_builder.values().append_value(m.as_str());
- }
-
- list_builder.append(true);
- }
- None => list_builder.append(false),
- }
- }
- _ => list_builder.append(false),
- }
- Ok(())
- })
- .collect::<std::result::Result<Vec<()>, ArrowError>>()?;
- Ok(Arc::new(list_builder.finish()))
-}
-
/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// used by regexp_replace
fn regex_replace_posix_groups(replacement: &str) -> String {
@@ -284,7 +207,7 @@ fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
Ok(new_null_array(input_array.data_type(), input_array.len()))
}
-/// Special cased regex_replace implementation for the scenerio where
+/// Special cased regex_replace implementation for the scenario where
/// the pattern, replacement and flags are static (arrays that are derived
/// from scalars). This means we can skip regex caching system and basically
/// hold a single Regex object for the replace operation. This also speeds