You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2024/02/03 10:56:15 UTC

(arrow-datafusion) branch main updated: Cleanup regex_expressions.rs to remove _regexp_match function (#9107)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 9d1502bce0 Cleanup regex_expressions.rs to remove _regexp_match function  (#9107)
9d1502bce0 is described below

commit 9d1502bce08172a3501524fb3ef4754af3cf98b6
Author: Bruce Ritchie <br...@veeva.com>
AuthorDate: Sat Feb 3 05:56:10 2024 -0500

    Cleanup regex_expressions.rs to remove _regexp_match function  (#9107)
    
    * Cleanup regex_expressions.rs to remove _regexp_match function #9106
    
    * Adding datafusion-cli Cargo.lock
    
    * Update datafusion/physical-expr/src/regex_expressions.rs
    
    Removed extraneous line as per code review suggestion
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    * Update datafusion/physical-expr/src/regex_expressions.rs
    
    Removed extraneous lines as per code review
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    * Make rustfmt happy again.
    
    ---------
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 Cargo.toml                                        |  1 +
 datafusion-cli/Cargo.lock                         |  1 +
 datafusion/physical-expr/Cargo.toml               |  1 +
 datafusion/physical-expr/src/regex_expressions.rs | 97 +++--------------------
 4 files changed, 13 insertions(+), 87 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 4c0e7bde26..e99618dd9f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] }
 arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] }
 arrow-ord = { version = "50.0.0", default-features = false }
 arrow-schema = { version = "50.0.0", default-features = false }
+arrow-string = { version = "50.0.0", default-features = false }
 async-trait = "0.1.73"
 bigdecimal = "0.4.1"
 bytes = "1.4"
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 4d5b3b711d..94c57bd770 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1264,6 +1264,7 @@ dependencies = [
  "arrow-buffer",
  "arrow-ord",
  "arrow-schema",
+ "arrow-string",
  "base64",
  "blake2",
  "blake3",
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index dc3ecdb14f..029f4565c0 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -49,6 +49,7 @@ arrow-array = { workspace = true }
 arrow-buffer = { workspace = true }
 arrow-ord = { workspace = true }
 arrow-schema = { workspace = true }
+arrow-string = { workspace = true }
 base64 = { version = "0.21", optional = true }
 blake2 = { version = "^0.10.2", optional = true }
 blake3 = { version = "1.0", optional = true }
diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs
index bdd272563e..83733da864 100644
--- a/datafusion/physical-expr/src/regex_expressions.rs
+++ b/datafusion/physical-expr/src/regex_expressions.rs
@@ -25,8 +25,7 @@ use arrow::array::{
     new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray,
     OffsetSizeTrait,
 };
-use arrow_array::builder::{GenericStringBuilder, ListBuilder};
-use arrow_schema::ArrowError;
+
 use datafusion_common::{arrow_datafusion_err, plan_err};
 use datafusion_common::{
     cast::as_generic_string_array, internal_err, DataFusionError, Result,
@@ -61,19 +60,20 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
         2 => {
             let values = as_generic_string_array::<T>(&args[0])?;
             let regex = as_generic_string_array::<T>(&args[1])?;
-            _regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e))
+            arrow_string::regexp::regexp_match(values, regex, None)
+                .map_err(|e| arrow_datafusion_err!(e))
         }
         3 => {
             let values = as_generic_string_array::<T>(&args[0])?;
             let regex = as_generic_string_array::<T>(&args[1])?;
-            let flags = Some(as_generic_string_array::<T>(&args[2])?);
+            let flags = as_generic_string_array::<T>(&args[2])?;
 
-            match flags {
-                Some(f) if f.iter().any(|s| s == Some("g")) => {
-                    plan_err!("regexp_match() does not support the \"global\" option")
-                },
-                _ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)),
+            if flags.iter().any(|s| s == Some("g")) {
+                return plan_err!("regexp_match() does not support the \"global\" option")
             }
+
+            arrow_string::regexp::regexp_match(values, regex, Some(flags))
+                .map_err(|e| arrow_datafusion_err!(e))
         }
         other => internal_err!(
             "regexp_match was called with {other} arguments. It requires at least 2 and at most 3."
@@ -81,83 +81,6 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
-/// TODO: Remove this once it is included in arrow-rs new release.
-/// <https://github.com/apache/arrow-rs/pull/5235>
-fn _regexp_match<OffsetSize: OffsetSizeTrait>(
-    array: &GenericStringArray<OffsetSize>,
-    regex_array: &GenericStringArray<OffsetSize>,
-    flags_array: Option<&GenericStringArray<OffsetSize>>,
-) -> std::result::Result<ArrayRef, ArrowError> {
-    let mut patterns: std::collections::HashMap<String, Regex> =
-        std::collections::HashMap::new();
-    let builder: GenericStringBuilder<OffsetSize> =
-        GenericStringBuilder::with_capacity(0, 0);
-    let mut list_builder = ListBuilder::new(builder);
-
-    let complete_pattern = match flags_array {
-        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
-            |(pattern, flags)| {
-                pattern.map(|pattern| match flags {
-                    Some(value) => format!("(?{value}){pattern}"),
-                    None => pattern.to_string(),
-                })
-            },
-        )) as Box<dyn Iterator<Item = Option<String>>>,
-        None => Box::new(
-            regex_array
-                .iter()
-                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
-        ),
-    };
-
-    array
-        .iter()
-        .zip(complete_pattern)
-        .map(|(value, pattern)| {
-            match (value, pattern) {
-                // Required for Postgres compatibility:
-                // SELECT regexp_match('foobarbequebaz', ''); = {""}
-                (Some(_), Some(pattern)) if pattern == *"" => {
-                    list_builder.values().append_value("");
-                    list_builder.append(true);
-                }
-                (Some(value), Some(pattern)) => {
-                    let existing_pattern = patterns.get(&pattern);
-                    let re = match existing_pattern {
-                        Some(re) => re,
-                        None => {
-                            let re = Regex::new(pattern.as_str()).map_err(|e| {
-                                ArrowError::ComputeError(format!(
-                                    "Regular expression did not compile: {e:?}"
-                                ))
-                            })?;
-                            patterns.insert(pattern.clone(), re);
-                            patterns.get(&pattern).unwrap()
-                        }
-                    };
-                    match re.captures(value) {
-                        Some(caps) => {
-                            let mut iter = caps.iter();
-                            if caps.len() > 1 {
-                                iter.next();
-                            }
-                            for m in iter.flatten() {
-                                list_builder.values().append_value(m.as_str());
-                            }
-
-                            list_builder.append(true);
-                        }
-                        None => list_builder.append(false),
-                    }
-                }
-                _ => list_builder.append(false),
-            }
-            Ok(())
-        })
-        .collect::<std::result::Result<Vec<()>, ArrowError>>()?;
-    Ok(Arc::new(list_builder.finish()))
-}
-
 /// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
 /// used by regexp_replace
 fn regex_replace_posix_groups(replacement: &str) -> String {
@@ -284,7 +207,7 @@ fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
     Ok(new_null_array(input_array.data_type(), input_array.len()))
 }
 
-/// Special cased regex_replace implementation for the scenerio where
+/// Special cased regex_replace implementation for the scenario where
 /// the pattern, replacement and flags are static (arrays that are derived
 /// from scalars). This means we can skip regex caching system and basically
 /// hold a single Regex object for the replace operation. This also speeds