You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/12/29 12:39:57 UTC
[arrow-datafusion] branch master updated: simplify regex expressions (#4646)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new d4934e8d1 simplify regex expressions (#4646)
d4934e8d1 is described below
commit d4934e8d19b9f32161eedb96a7a0f28a50ba51ba
Author: Marco Neumann <ma...@crepererum.net>
AuthorDate: Thu Dec 29 12:39:51 2022 +0000
simplify regex expressions (#4646)
* feat: simplify regex expressions
Closes #4370.
* Fix typo in constant name, add coverage
* cleanups
* fmt
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
datafusion-cli/Cargo.lock | 1 +
datafusion/optimizer/Cargo.toml | 1 +
.../src/simplify_expressions/expr_simplifier.rs | 182 ++++++++++++++++++++-
.../optimizer/src/simplify_expressions/mod.rs | 1 +
.../optimizer/src/simplify_expressions/regex.rs | 162 ++++++++++++++++++
5 files changed, 338 insertions(+), 9 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index baddb4226..a5485a0cf 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -745,6 +745,7 @@ dependencies = [
"datafusion-physical-expr",
"hashbrown 0.13.1",
"log",
+ "regex-syntax",
]
[[package]]
diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml
index e3810ba30..d0f94a7ef 100644
--- a/datafusion/optimizer/Cargo.toml
+++ b/datafusion/optimizer/Cargo.toml
@@ -45,6 +45,7 @@ datafusion-expr = { path = "../expr", version = "15.0.0" }
datafusion-physical-expr = { path = "../physical-expr", version = "15.0.0" }
hashbrown = { version = "0.13", features = ["raw"] }
log = "^0.4"
+regex-syntax = "0.6.28"
[dev-dependencies]
ctor = "0.1.22"
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 3a51099fe..08c96168d 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -18,7 +18,9 @@
//! Expression simplification API
use super::utils::*;
-use crate::type_coercion::TypeCoercionRewriter;
+use crate::{
+ simplify_expressions::regex::simplify_regex_expr, type_coercion::TypeCoercionRewriter,
+};
use arrow::{
array::new_null_array,
datatypes::{DataType, Field, Schema},
@@ -332,7 +334,10 @@ impl<'a, S> Simplifier<'a, S> {
impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
/// rewrite the expression simplifying any constant expressions
fn mutate(&mut self, expr: Expr) -> Result<Expr> {
- use datafusion_expr::Operator::{And, Divide, Eq, Modulo, Multiply, NotEq, Or};
+ use datafusion_expr::Operator::{
+ And, Divide, Eq, Modulo, Multiply, NotEq, Or, RegexIMatch, RegexMatch,
+ RegexNotIMatch, RegexNotMatch,
+ };
let info = self.info;
let new_expr = match expr {
@@ -779,10 +784,18 @@ impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
)
}
}
- expr => {
- // no additional rewrites possible
- expr
- }
+
+ //
+ // Rules for regexes
+ //
+ Expr::BinaryExpr(BinaryExpr {
+ left,
+ op: op @ (RegexMatch | RegexNotMatch | RegexIMatch | RegexNotIMatch),
+ right,
+ }) => simplify_regex_expr(left, op, right)?,
+
+ // no additional rewrites possible
+ expr => expr,
};
Ok(new_expr)
}
@@ -803,7 +816,7 @@ mod tests {
datatypes::{DataType, Field, Schema},
};
use chrono::{DateTime, TimeZone, Utc};
- use datafusion_common::{cast::as_int32_array, DFField, ToDFSchema};
+ use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema};
use datafusion_expr::*;
use datafusion_physical_expr::{
execution_props::ExecutionProps, functions::make_scalar_function,
@@ -1576,17 +1589,168 @@ mod tests {
assert_eq!(simplify(expr), expected)
}
+ #[test]
+ fn test_simplify_regex() {
+ // malformed regex
+ assert_contains!(
+ try_simplify(regex_match(col("c1"), lit("foo{")))
+ .unwrap_err()
+ .to_string(),
+ "regex parse error"
+ );
+
+ // unsupported cases
+ assert_no_change(regex_match(col("c1"), lit("foo.*")));
+ assert_no_change(regex_match(col("c1"), lit("(foo)")));
+ assert_no_change(regex_match(col("c1"), lit("^foo")));
+ assert_no_change(regex_match(col("c1"), lit("foo$")));
+ assert_no_change(regex_match(col("c1"), lit("%")));
+ assert_no_change(regex_match(col("c1"), lit("_")));
+ assert_no_change(regex_match(col("c1"), lit("f%o")));
+ assert_no_change(regex_match(col("c1"), lit("f_o")));
+
+ // empty cases
+ assert_change(regex_match(col("c1"), lit("")), like(col("c1"), "%"));
+ assert_change(
+ regex_not_match(col("c1"), lit("")),
+ not_like(col("c1"), "%"),
+ );
+ assert_change(regex_imatch(col("c1"), lit("")), ilike(col("c1"), "%"));
+ assert_change(
+ regex_not_imatch(col("c1"), lit("")),
+ not_ilike(col("c1"), "%"),
+ );
+
+ // single character
+ assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
+
+ // single word
+ assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));
+
+ // OR-chain
+ assert_change(
+ regex_match(col("c1"), lit("foo|bar|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%bar%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo|x|baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%x%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_match(col("c1"), lit("foo||baz")),
+ like(col("c1"), "%foo%")
+ .or(like(col("c1"), "%"))
+ .or(like(col("c1"), "%baz%")),
+ );
+ assert_change(
+ regex_not_match(col("c1"), lit("foo|bar|baz")),
+ not_like(col("c1"), "%foo%")
+ .and(not_like(col("c1"), "%bar%"))
+ .and(not_like(col("c1"), "%baz%")),
+ );
+ // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
+ assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
+ }
+
+ #[track_caller]
+ fn assert_no_change(expr: Expr) {
+ let optimized = simplify(expr.clone());
+ assert_eq!(expr, optimized);
+ }
+
+ #[track_caller]
+ fn assert_change(expr: Expr, expected: Expr) {
+ let optimized = simplify(expr);
+ assert_eq!(optimized, expected);
+ }
+
+ fn regex_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_match(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn regex_not_imatch(left: Expr, right: Expr) -> Expr {
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(left),
+ op: Operator::RegexNotIMatch,
+ right: Box::new(right),
+ })
+ }
+
+ fn like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern: Box::new(lit(pattern)),
+ escape_char: None,
+ })
+ }
+
+ fn not_like(expr: Expr, pattern: &str) -> Expr {
+ Expr::Like(Like {
+ negated: true,
+ expr: Box::new(expr),
+ pattern: Box::new(lit(pattern)),
+ escape_char: None,
+ })
+ }
+
+ fn ilike(expr: Expr, pattern: &str) -> Expr {
+ Expr::ILike(Like {
+ negated: false,
+ expr: Box::new(expr),
+ pattern: Box::new(lit(pattern)),
+ escape_char: None,
+ })
+ }
+
+ fn not_ilike(expr: Expr, pattern: &str) -> Expr {
+ Expr::ILike(Like {
+ negated: true,
+ expr: Box::new(expr),
+ pattern: Box::new(lit(pattern)),
+ escape_char: None,
+ })
+ }
+
// ------------------------------
// ----- Simplifier tests -------
// ------------------------------
- fn simplify(expr: Expr) -> Expr {
+ fn try_simplify(expr: Expr) -> Result<Expr> {
let schema = expr_test_schema();
let execution_props = ExecutionProps::new();
let simplifier = ExprSimplifier::new(
SimplifyContext::new(&execution_props).with_schema(schema),
);
- simplifier.simplify(expr).unwrap()
+ simplifier.simplify(expr)
+ }
+
+ fn simplify(expr: Expr) -> Expr {
+ try_simplify(expr).unwrap()
}
fn expr_test_schema() -> DFSchemaRef {
diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs
index 0eb3359c6..975976e1f 100644
--- a/datafusion/optimizer/src/simplify_expressions/mod.rs
+++ b/datafusion/optimizer/src/simplify_expressions/mod.rs
@@ -17,6 +17,7 @@
pub mod context;
pub mod expr_simplifier;
+mod regex;
pub mod simplify_exprs;
mod utils;
diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs
new file mode 100644
index 000000000..038e1fcce
--- /dev/null
+++ b/datafusion/optimizer/src/simplify_expressions/regex.rs
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{BinaryExpr, Expr, Like, Operator};
+use regex_syntax::hir::{Hir, HirKind, Literal};
+
+/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
+const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
+
+pub fn simplify_regex_expr(
+ left: Box<Expr>,
+ op: Operator,
+ right: Box<Expr>,
+) -> Result<Expr, DataFusionError> {
+ let mode = OperatorMode::new(&op);
+
+ if let Expr::Literal(ScalarValue::Utf8(Some(pattern))) = right.as_ref() {
+ match regex_syntax::Parser::new().parse(pattern) {
+ Ok(hir) => {
+ let kind = hir.kind();
+
+ if let HirKind::Alternation(alts) = kind {
+ if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION {
+ if let Some(expr) = lower_alt(&mode, &left, alts) {
+ return Ok(expr);
+ }
+ }
+ } else if let Some(expr) = lower_simple(&mode, &left, &hir) {
+ return Ok(expr);
+ }
+ }
+ Err(e) => {
+ // error out early since the execution may fail anyways
+ return Err(DataFusionError::Context(
+ "Invalid regex".to_owned(),
+ Box::new(DataFusionError::External(Box::new(e))),
+ ));
+ }
+ }
+ }
+
+ // leave untouched if optimization didn't work
+ Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
+}
+
+struct OperatorMode {
+ not: bool,
+ i: bool,
+}
+
+impl OperatorMode {
+ fn new(op: &Operator) -> Self {
+ let not = match op {
+ Operator::RegexMatch | Operator::RegexIMatch => false,
+ Operator::RegexNotMatch | Operator::RegexNotIMatch => true,
+ _ => unreachable!(),
+ };
+
+ let i = match op {
+ Operator::RegexMatch | Operator::RegexNotMatch => false,
+ Operator::RegexIMatch | Operator::RegexNotIMatch => true,
+ _ => unreachable!(),
+ };
+
+ Self { not, i }
+ }
+
+ fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
+ let like = Like {
+ negated: self.not,
+ expr,
+ pattern: Box::new(Expr::Literal(ScalarValue::Utf8(Some(pattern)))),
+ escape_char: None,
+ };
+
+ if self.i {
+ Expr::ILike(like)
+ } else {
+ Expr::Like(like)
+ }
+ }
+}
+
+fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
+ let mut s = String::with_capacity(parts.len() + 2);
+ s.push('%');
+
+ for sub in parts {
+ if let HirKind::Literal(Literal::Unicode(c)) = sub.kind() {
+ if !is_safe_for_like(*c) {
+ return None;
+ }
+ s.push(*c);
+ } else {
+ return None;
+ }
+ }
+
+ s.push('%');
+ Some(s)
+}
+
+fn is_safe_for_like(c: char) -> bool {
+ (c != '%') && (c != '_')
+}
+
+fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
+ match hir.kind() {
+ HirKind::Empty => {
+ return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
+ }
+ HirKind::Literal(Literal::Unicode(c)) if is_safe_for_like(*c) => {
+ return Some(mode.expr(Box::new(left.clone()), format!("%{c}%")));
+ }
+ HirKind::Concat(inner) => {
+ if let Some(pattern) = collect_concat_to_like_string(inner) {
+ return Some(mode.expr(Box::new(left.clone()), pattern));
+ }
+ }
+ _ => {}
+ }
+
+ None
+}
+
+fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
+ let mut accu: Option<Expr> = None;
+
+ for part in alts {
+ if let Some(expr) = lower_simple(mode, left, part) {
+ accu = match accu {
+ Some(accu) => {
+ if mode.not {
+ Some(accu.and(expr))
+ } else {
+ Some(accu.or(expr))
+ }
+ }
+ None => Some(expr),
+ };
+ } else {
+ return None;
+ }
+ }
+
+ Some(accu.expect("at least two alts"))
+}