You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/12/29 12:39:57 UTC

[arrow-datafusion] branch master updated: simplify regex expressions (#4646)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new d4934e8d1 simplify regex expressions (#4646)
d4934e8d1 is described below

commit d4934e8d19b9f32161eedb96a7a0f28a50ba51ba
Author: Marco Neumann <ma...@crepererum.net>
AuthorDate: Thu Dec 29 12:39:51 2022 +0000

    simplify regex expressions (#4646)
    
    * feat: simplify regex expressions
    
    Closes #4370.
    
    * Fix typo in constant name, add coverage
    
    * cleanups
    
    * fmt
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
 datafusion-cli/Cargo.lock                          |   1 +
 datafusion/optimizer/Cargo.toml                    |   1 +
 .../src/simplify_expressions/expr_simplifier.rs    | 182 ++++++++++++++++++++-
 .../optimizer/src/simplify_expressions/mod.rs      |   1 +
 .../optimizer/src/simplify_expressions/regex.rs    | 162 ++++++++++++++++++
 5 files changed, 338 insertions(+), 9 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index baddb4226..a5485a0cf 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -745,6 +745,7 @@ dependencies = [
  "datafusion-physical-expr",
  "hashbrown 0.13.1",
  "log",
+ "regex-syntax",
 ]
 
 [[package]]
diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml
index e3810ba30..d0f94a7ef 100644
--- a/datafusion/optimizer/Cargo.toml
+++ b/datafusion/optimizer/Cargo.toml
@@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" }
 datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" }
 hashbrown = { version = "0.13", features = ["raw"] }
 log = "^0.4"
+regex-syntax = "0.6.28"
 
 [dev-dependencies]
 ctor = "0.1.22"
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 3a51099fe..08c96168d 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -18,7 +18,9 @@
 //! Expression simplification API
 
 use super::utils::*;
-use crate::type_coercion::TypeCoercionRewriter;
+use crate::{
+    simplify_expressions::regex::simplify_regex_expr, type_coercion::TypeCoercionRewriter,
+};
 use arrow::{
     array::new_null_array,
     datatypes::{DataType, Field, Schema},
@@ -332,7 +334,10 @@ impl<'a, S> Simplifier<'a, S> {
 impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
     /// rewrite the expression simplifying any constant expressions
     fn mutate(&mut self, expr: Expr) -> Result<Expr> {
-        use datafusion_expr::Operator::{And, Divide, Eq, Modulo, Multiply, NotEq, Or};
+        use datafusion_expr::Operator::{
+            And, Divide, Eq, Modulo, Multiply, NotEq, Or, RegexIMatch, RegexMatch,
+            RegexNotIMatch, RegexNotMatch,
+        };
 
         let info = self.info;
         let new_expr = match expr {
@@ -779,10 +784,18 @@ impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
                     )
                 }
             }
-            expr => {
-                // no additional rewrites possible
-                expr
-            }
+
+            //
+            // Rules for regexes
+            //
+            Expr::BinaryExpr(BinaryExpr {
+                left,
+                op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
+                right,
+            }) => simplify_regex_expr(left, op, right)?,
+
+            // no additional rewrites possible
+            expr => expr,
         };
         Ok(new_expr)
     }
@@ -803,7 +816,7 @@ mod tests {
         datatypes::{DataType, Field, Schema},
     };
     use chrono::{DateTime, TimeZone, Utc};
-    use datafusion_common::{cast::as_int32_array, DFField, ToDFSchema};
+    use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema};
     use datafusion_expr::*;
     use datafusion_physical_expr::{
         execution_props::ExecutionProps, functions::make_scalar_function,
@@ -1576,17 +1589,168 @@ mod tests {
         assert_eq!(simplify(expr), expected)
     }
 
+    #[test]
+    fn test_simplify_regex() {
+        // malformed regex
+        assert_contains!(
+            try_simplify(regex_match(col("c1"), lit("foo{")))
+                .unwrap_err()
+                .to_string(),
+            "regex parse error"
+        );
+
+        // unsupported cases
+        assert_no_change(regex_match(col("c1"), lit("foo.*")));
+        assert_no_change(regex_match(col("c1"), lit("(foo)")));
+        assert_no_change(regex_match(col("c1"), lit("^foo")));
+        assert_no_change(regex_match(col("c1"), lit("foo$")));
+        assert_no_change(regex_match(col("c1"), lit("%")));
+        assert_no_change(regex_match(col("c1"), lit("_")));
+        assert_no_change(regex_match(col("c1"), lit("f%o")));
+        assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+        // empty cases
+        assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+        assert_change(
+            regex_not_match(col("c1"), lit("")),
+            not_like(col("c1"), "%"),
+        );
+        assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+        assert_change(
+            regex_not_imatch(col("c1"), lit("")),
+            not_ilike(col("c1"), "%"),
+        );
+
+        // single character
+        assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
+
+        // single word
+        assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));
+
+        // OR-chain
+        assert_change(
+            regex_match(col("c1"), lit("foo|bar|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%bar%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo|x|baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%x%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_match(col("c1"), lit("foo||baz")),
+            like(col("c1"), "%foo%")
+                .or(like(col("c1"), "%"))
+                .or(like(col("c1"), "%baz%")),
+        );
+        assert_change(
+            regex_not_match(col("c1"), lit("foo|bar|baz")),
+            not_like(col("c1"), "%foo%")
+                .and(not_like(col("c1"), "%bar%"))
+                .and(not_like(col("c1"), "%baz%")),
+        );
+        // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
+        assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
+    }
+
+    #[track_caller]
+    fn assert_no_change(expr: Expr) {
+        let optimized = simplify(expr.clone());
+        assert_eq!(expr, optimized);
+    }
+
+    #[track_caller]
+    fn assert_change(expr: Expr, expected: Expr) {
+        let optimized = simplify(expr);
+        assert_eq!(optimized, expected);
+    }
+
+    fn regex_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_match(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+        Expr::BinaryExpr(BinaryExpr {
+            left: Box::new(left),
+            op: Operator::RegexNotIMatch,
+            right: Box::new(right),
+        })
+    }
+
+    fn like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: Box::new(lit(pattern)),
+            escape_char: None,
+        })
+    }
+
+    fn not_like(expr: Expr, pattern: &str) -> Expr {
+        Expr::Like(Like {
+            negated: true,
+            expr: Box::new(expr),
+            pattern: Box::new(lit(pattern)),
+            escape_char: None,
+        })
+    }
+
+    fn ilike(expr: Expr, pattern: &str) -> Expr {
+        Expr::ILike(Like {
+            negated: false,
+            expr: Box::new(expr),
+            pattern: Box::new(lit(pattern)),
+            escape_char: None,
+        })
+    }
+
+    fn not_ilike(expr: Expr, pattern: &str) -> Expr {
+        Expr::ILike(Like {
+            negated: true,
+            expr: Box::new(expr),
+            pattern: Box::new(lit(pattern)),
+            escape_char: None,
+        })
+    }
+
     // ------------------------------
     // ----- Simplifier tests -------
     // ------------------------------
 
-    fn simplify(expr: Expr) -> Expr {
+    fn try_simplify(expr: Expr) -> Result<Expr> {
         let schema = expr_test_schema();
         let execution_props = ExecutionProps::new();
         let simplifier = ExprSimplifier::new(
             SimplifyContext::new(&execution_props).with_schema(schema),
         );
-        simplifier.simplify(expr).unwrap()
+        simplifier.simplify(expr)
+    }
+
+    fn simplify(expr: Expr) -> Expr {
+        try_simplify(expr).unwrap()
     }
 
     fn expr_test_schema() -> DFSchemaRef {
diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs
index 0eb3359c6..975976e1f 100644
--- a/datafusion/optimizer/src/simplify_expressions/mod.rs
+++ b/datafusion/optimizer/src/simplify_expressions/mod.rs
@@ -17,6 +17,7 @@
 
 pub mod context;
 pub mod expr_simplifier;
+mod regex;
 pub mod simplify_exprs;
 mod utils;
 
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
new file mode 100644
index 000000000..038e1fcce
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{BinaryExpr, Expr, Like, Operator};
+use regex_syntax::hir::{Hir, HirKind, Literal};
+
+/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
+const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
+
+pub fn simplify_regex_expr(
+    left: Box<Expr>,
+    op: Operator,
+    right: Box<Expr>,
+) -> Result<Expr, DataFusionError> {
+    let mode = OperatorMode::new(&op);
+
+    if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() {
+        match regex_syntax::Parser::new().parse(pattern) {
+            Ok(hir) => {
+                let kind = hir.kind();
+
+                if let HirKind::Alternation(alts) = kind {
+                    if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION {
+                        if let Some(expr) = lower_alt(&mode, &left, alts) {
+                            return Ok(expr);
+                        }
+                    }
+                } else if let Some(expr) = lower_simple(&mode, &left, &hir) {
+                    return Ok(expr);
+                }
+            }
+            Err(e) => {
+                // error out early since the execution may fail anyways
+                return Err(DataFusionError::Context(
+                    "Invalid regex".to_owned(),
+                    Box::new(DataFusionError::External(Box::new(e))),
+                ));
+            }
+        }
+    }
+
+    // leave untouched if optimization didn't work
+    Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
+}
+
+struct OperatorMode {
+    not: bool,
+    i: bool,
+}
+
+impl OperatorMode {
+    fn new(op: &Operator) -> Self {
+        let not = match op {
+            Operator::RegexMatch | Operator::RegexIMatch => false,
+            Operator::RegexNotMatch | Operator::RegexNotIMatch => true,
+            _ => unreachable!(),
+        };
+
+        let i = match op {
+            Operator::RegexMatch | Operator::RegexNotMatch => false,
+            Operator::RegexIMatch | Operator::RegexNotIMatch => true,
+            _ => unreachable!(),
+        };
+
+        Self { not, i }
+    }
+
+    fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
+        let like = Like {
+            negated: self.not,
+            expr,
+            pattern: Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern)))),
+            escape_char: None,
+        };
+
+        if self.i {
+            Expr::ILike(like)
+        } else {
+            Expr::Like(like)
+        }
+    }
+}
+
+fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
+    let mut s = String::with_capacity(parts.len() + 2);
+    s.push('%');
+
+    for sub in parts {
+        if let HirKind::Literal(Literal::Unicode(c)) = sub.kind() {
+            if !is_safe_for_like(*c) {
+                return None;
+            }
+            s.push(*c);
+        } else {
+            return None;
+        }
+    }
+
+    s.push('%');
+    Some(s)
+}
+
+fn is_safe_for_like(c: char) -> bool {
+    (c != '%') && (c != '_')
+}
+
+fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
+    match hir.kind() {
+        HirKind::Empty => {
+            return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
+        }
+        HirKind::Literal(Literal::Unicode(c)) if is_safe_for_like(*c) => {
+            return Some(mode.expr(Box::new(left.clone()), format!("%{c}%")));
+        }
+        HirKind::Concat(inner) => {
+            if let Some(pattern) = collect_concat_to_like_string(inner) {
+                return Some(mode.expr(Box::new(left.clone()), pattern));
+            }
+        }
+        _ => {}
+    }
+
+    None
+}
+
+fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
+    let mut accu: Option<Expr> = None;
+
+    for part in alts {
+        if let Some(expr) = lower_simple(mode, left, part) {
+            accu = match accu {
+                Some(accu) => {
+                    if mode.not {
+                        Some(accu.and(expr))
+                    } else {
+                        Some(accu.or(expr))
+                    }
+                }
+                None => Some(expr),
+            };
+        } else {
+            return None;
+        }
+    }
+
+    Some(accu.expect("at least two alts"))
+}