You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/02/02 14:28:27 UTC
[arrow-datafusion] branch master updated: Add Expression Simplification API (#1717)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new e4a056f Add Expression Simplification API (#1717)
e4a056f is described below
commit e4a056f00c4c5a09bddc1d16b83f771926f7b4a9
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Feb 2 09:28:20 2022 -0500
Add Expression Simplification API (#1717)
* Add Expression Simplification API
* fmt
---
datafusion/src/logical_plan/expr.rs | 68 +++++++++
datafusion/src/logical_plan/mod.rs | 1 +
datafusion/src/optimizer/simplify_expressions.rs | 179 +++++++++++++----------
datafusion/tests/simplification.rs | 106 ++++++++++++++
4 files changed, 273 insertions(+), 81 deletions(-)
diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs
index a1e51e0..63bf72c 100644
--- a/datafusion/src/logical_plan/expr.rs
+++ b/datafusion/src/logical_plan/expr.rs
@@ -20,10 +20,12 @@
pub use super::Operator;
use crate::error::{DataFusionError, Result};
+use crate::execution::context::ExecutionProps;
use crate::field_util::get_indexed_field;
use crate::logical_plan::{
plan::Aggregate, window_frames, DFField, DFSchema, LogicalPlan,
};
+use crate::optimizer::simplify_expressions::{ConstEvaluator, Simplifier};
use crate::physical_plan::functions::Volatility;
use crate::physical_plan::{
aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF,
@@ -971,6 +973,58 @@ impl Expr {
Ok(expr)
}
}
+
+ /// Simplifies this [`Expr`]`s as much as possible, evaluating
+ /// constants and applying algebraic simplifications
+ ///
+ /// # Example:
+ /// `b > 2 AND b > 2`
+ /// can be written to
+ /// `b > 2`
+ ///
+ /// ```
+ /// use datafusion::logical_plan::*;
+ /// use datafusion::error::Result;
+ /// use datafusion::execution::context::ExecutionProps;
+ ///
+ /// /// Simple implementation that provides `Simplifier` the information it needs
+ /// #[derive(Default)]
+ /// struct Info {
+ /// execution_props: ExecutionProps,
+ /// };
+ ///
+ /// impl SimplifyInfo for Info {
+ /// fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+ /// Ok(false)
+ /// }
+ /// fn nullable(&self, expr: &Expr) -> Result<bool> {
+ /// Ok(true)
+ /// }
+ /// fn execution_props(&self) -> &ExecutionProps {
+ /// &self.execution_props
+ /// }
+ /// }
+ ///
+ /// // b < 2
+ /// let b_lt_2 = col("b").gt(lit(2));
+ ///
+ /// // (b < 2) OR (b < 2)
+ /// let expr = b_lt_2.clone().or(b_lt_2.clone());
+ ///
+ /// // (b < 2) OR (b < 2) --> (b < 2)
+ /// let expr = expr.simplify(&Info::default()).unwrap();
+ /// assert_eq!(expr, b_lt_2);
+ /// ```
+ pub fn simplify<S: SimplifyInfo>(self, info: &S) -> Result<Self> {
+ let mut rewriter = Simplifier::new(info);
+ let mut const_evaluator = ConstEvaluator::new(info.execution_props());
+
+ // TODO iterate until no changes are made during rewrite
+ // (evaluating constants can enable new simplifications and
+ // simplifications can enable new constant evaluation)
+ // https://github.com/apache/arrow-datafusion/issues/1160
+ self.rewrite(&mut const_evaluator)?.rewrite(&mut rewriter)
+ }
}
impl Not for Expr {
@@ -1092,6 +1146,20 @@ pub trait ExprRewriter: Sized {
fn mutate(&mut self, expr: Expr) -> Result<Expr>;
}
+/// The information necessary to apply algebraic simplification to an
+/// [Expr]. See [SimplifyContext] for one implementation
+pub trait SimplifyInfo {
+ /// returns true if this Expr has boolean type
+ fn is_boolean_type(&self, expr: &Expr) -> Result<bool>;
+
+ /// returns true of this expr is nullable (could possibly be NULL)
+ fn nullable(&self, expr: &Expr) -> Result<bool>;
+
+ /// Returns details needed for partial expression evaluation
+ fn execution_props(&self) -> &ExecutionProps;
+}
+
+/// Helper struct for building [Expr::Case]
pub struct CaseBuilder {
expr: Option<Box<Expr>>,
when_expr: Vec<Expr>,
diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs
index 06c6bf9..2571451 100644
--- a/datafusion/src/logical_plan/mod.rs
+++ b/datafusion/src/logical_plan/mod.rs
@@ -47,6 +47,7 @@ pub use expr::{
signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, to_hex,
translate, trim, trunc, unalias, unnormalize_col, unnormalize_cols, upper, when,
Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion,
+ SimplifyInfo,
};
pub use extension::UserDefinedLogicalNode;
pub use operators::Operator;
diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs
index 00739cc..c000bdb 100644
--- a/datafusion/src/optimizer/simplify_expressions.rs
+++ b/datafusion/src/optimizer/simplify_expressions.rs
@@ -23,8 +23,10 @@ use arrow::record_batch::RecordBatch;
use crate::error::DataFusionError;
use crate::execution::context::ExecutionProps;
-use crate::logical_plan::{lit, DFSchemaRef, Expr};
-use crate::logical_plan::{DFSchema, ExprRewriter, LogicalPlan, RewriteRecursion};
+use crate::logical_plan::{
+ lit, DFSchema, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, RewriteRecursion,
+ SimplifyInfo,
+};
use crate::optimizer::optimizer::OptimizerRule;
use crate::optimizer::utils;
use crate::physical_plan::functions::Volatility;
@@ -32,8 +34,57 @@ use crate::physical_plan::planner::create_physical_expr;
use crate::scalar::ScalarValue;
use crate::{error::Result, logical_plan::Operator};
-/// Simplifies plans by rewriting [`Expr`]`s evaluating constants
-/// and applying algebraic simplifications
+/// Provides simplification information based on schema and properties
+struct SimplifyContext<'a, 'b> {
+ schemas: Vec<&'a DFSchemaRef>,
+ props: &'b ExecutionProps,
+}
+
+impl<'a, 'b> SimplifyContext<'a, 'b> {
+ /// Create a new SimplifyContext
+ pub fn new(schemas: Vec<&'a DFSchemaRef>, props: &'b ExecutionProps) -> Self {
+ Self { schemas, props }
+ }
+}
+
+impl<'a, 'b> SimplifyInfo for SimplifyContext<'a, 'b> {
+ /// returns true if this Expr has boolean type
+ fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+ for schema in &self.schemas {
+ if let Ok(DataType::Boolean) = expr.get_type(schema) {
+ return Ok(true);
+ }
+ }
+
+ Ok(false)
+ }
+ /// Returns true if expr is nullable
+ fn nullable(&self, expr: &Expr) -> Result<bool> {
+ self.schemas
+ .iter()
+ .find_map(|schema| {
+ // expr may be from another input, so ignore errors
+ // by converting to None to keep trying
+ expr.nullable(schema.as_ref()).ok()
+ })
+ .ok_or_else(|| {
+ // This means we weren't able to compute `Expr::nullable` with
+ // *any* input schemas, signalling a problem
+ DataFusionError::Internal(format!(
+ "Could not find find columns in '{}' during simplify",
+ expr
+ ))
+ })
+ }
+
+ fn execution_props(&self) -> &ExecutionProps {
+ self.props
+ }
+}
+
+/// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting
+/// [`Expr`]`s evaluating constants and applying algebraic
+/// simplifications
///
/// # Introduction
/// It uses boolean algebra laws to simplify or reduce the number of terms in expressions.
@@ -44,7 +95,7 @@ use crate::{error::Result, logical_plan::Operator};
/// `Filter: b > 2`
///
#[derive(Default)]
-pub struct SimplifyExpressions {}
+pub(crate) struct SimplifyExpressions {}
/// returns true if `needle` is found in a chain of search_op
/// expressions. Such as: (A AND B) AND C
@@ -150,9 +201,7 @@ impl OptimizerRule for SimplifyExpressions {
// projected columns. With just the projected schema, it's not possible to infer types for
// expressions that references non-projected columns within the same project plan or its
// children plans.
- let mut simplifier = Simplifier::new(plan.all_schemas());
-
- let mut const_evaluator = ConstEvaluator::new(execution_props);
+ let info = SimplifyContext::new(plan.all_schemas(), execution_props);
let new_inputs = plan
.inputs()
@@ -168,15 +217,8 @@ impl OptimizerRule for SimplifyExpressions {
// Constant folding should not change expression name.
let name = &e.name(plan.schema());
- // TODO iterate until no changes are made
- // during rewrite (evaluating constants can
- // enable new simplifications and
- // simplifications can enable new constant
- // evaluation)
- let new_e = e
- // fold constants and then simplify
- .rewrite(&mut const_evaluator)?
- .rewrite(&mut simplifier)?;
+ // Apply the actual simplification logic
+ let new_e = e.simplify(&info)?;
let new_name = &new_e.name(plan.schema());
@@ -389,52 +431,23 @@ impl<'a> ConstEvaluator<'a> {
/// * `false = true` and `true = false` to `false`
/// * `!!expr` to `expr`
/// * `expr = null` and `expr != null` to `null`
-pub(crate) struct Simplifier<'a> {
- /// input schemas
- schemas: Vec<&'a DFSchemaRef>,
+pub(crate) struct Simplifier<'a, S> {
+ info: &'a S,
}
-impl<'a> Simplifier<'a> {
- pub fn new(schemas: Vec<&'a DFSchemaRef>) -> Self {
- Self { schemas }
- }
-
- fn is_boolean_type(&self, expr: &Expr) -> bool {
- for schema in &self.schemas {
- if let Ok(DataType::Boolean) = expr.get_type(schema) {
- return true;
- }
- }
-
- false
- }
-
- /// Returns true if expr is nullable
- fn nullable(&self, expr: &Expr) -> Result<bool> {
- self.schemas
- .iter()
- .find_map(|schema| {
- // expr may be from another input, so ignore errors
- // by converting to None to keep trying
- expr.nullable(schema.as_ref()).ok()
- })
- .ok_or_else(|| {
- // This means we weren't able to compute `Expr::nullable` with
- // *any* input schemas, signalling a problem
- DataFusionError::Internal(format!(
- "Could not find find columns in '{}' during simplify",
- expr
- ))
- })
+impl<'a, S> Simplifier<'a, S> {
+ pub fn new(info: &'a S) -> Self {
+ Self { info }
}
}
-impl<'a> ExprRewriter for Simplifier<'a> {
+impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
/// rewrite the expression simplifying any constant expressions
fn mutate(&mut self, expr: Expr) -> Result<Expr> {
use Expr::*;
use Operator::{And, Divide, Eq, Multiply, NotEq, Or};
+ let info = self.info;
let new_expr = match expr {
//
// Rules for Eq
@@ -447,7 +460,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: Eq,
right,
- } if is_bool_lit(&left) && self.is_boolean_type(&right) => {
+ } if is_bool_lit(&left) && info.is_boolean_type(&right)? => {
match as_bool_lit(*left) {
Some(true) => *right,
Some(false) => Not(right),
@@ -461,7 +474,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: Eq,
right,
- } if is_bool_lit(&right) && self.is_boolean_type(&left) => {
+ } if is_bool_lit(&right) && info.is_boolean_type(&left)? => {
match as_bool_lit(*right) {
Some(true) => *left,
Some(false) => Not(left),
@@ -480,7 +493,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: NotEq,
right,
- } if is_bool_lit(&left) && self.is_boolean_type(&right) => {
+ } if is_bool_lit(&left) && info.is_boolean_type(&right)? => {
match as_bool_lit(*left) {
Some(true) => Not(right),
Some(false) => *right,
@@ -494,7 +507,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: NotEq,
right,
- } if is_bool_lit(&right) && self.is_boolean_type(&left) => {
+ } if is_bool_lit(&right) && info.is_boolean_type(&left)? => {
match as_bool_lit(*right) {
Some(true) => Not(left),
Some(false) => *left,
@@ -547,13 +560,13 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: Or,
right,
- } if !self.nullable(&right)? && is_op_with(And, &right, &left) => *left,
+ } if !info.nullable(&right)? && is_op_with(And, &right, &left) => *left,
// (A AND B) OR A --> A (if B not null)
BinaryExpr {
left,
op: Or,
right,
- } if !self.nullable(&left)? && is_op_with(And, &left, &right) => *right,
+ } if !info.nullable(&left)? && is_op_with(And, &left, &right) => *right,
//
// Rules for AND
@@ -600,13 +613,13 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: And,
right,
- } if !self.nullable(&right)? && is_op_with(Or, &right, &left) => *left,
+ } if !info.nullable(&right)? && is_op_with(Or, &right, &left) => *left,
// (A OR B) AND A --> A (if B not null)
BinaryExpr {
left,
op: And,
right,
- } if !self.nullable(&left)? && is_op_with(Or, &left, &right) => *right,
+ } if !info.nullable(&left)? && is_op_with(Or, &left, &right) => *right,
//
// Rules for Multiply
@@ -643,7 +656,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
left,
op: Divide,
right,
- } if !self.nullable(&left)? && left == right => lit(1),
+ } if !info.nullable(&left)? && left == right => lit(1),
//
// Rules for Not
@@ -676,7 +689,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
else_expr,
} if !when_then_expr.is_empty()
&& when_then_expr.len() < 3 // The rewrite is O(n!) so limit to small number
- && self.is_boolean_type(&when_then_expr[0].1) =>
+ && info.is_boolean_type(&when_then_expr[0].1)? =>
{
// The disjunction of all the when predicates encountered so far
let mut filter_expr = lit(false);
@@ -1208,17 +1221,9 @@ mod tests {
fn simplify(expr: Expr) -> Expr {
let schema = expr_test_schema();
- let mut rewriter = Simplifier::new(vec![&schema]);
-
let execution_props = ExecutionProps::new();
- let mut const_evaluator = ConstEvaluator::new(&execution_props);
-
- expr.rewrite(&mut rewriter)
- .expect("expected to simplify")
- .rewrite(&mut const_evaluator)
- .expect("expected to const evaluate")
- .rewrite(&mut rewriter)
- .expect("expected to simplify")
+ let info = SimplifyContext::new(vec![&schema], &execution_props);
+ expr.simplify(&info).unwrap()
}
fn expr_test_schema() -> DFSchemaRef {
@@ -1357,30 +1362,36 @@ mod tests {
// CASE WHERE c2 THEN true ELSE c2
// -->
// c2
+ //
+ // Need to call simplify 2x due to
+ // https://github.com/apache/arrow-datafusion/issues/1160
assert_eq!(
- simplify(Expr::Case {
+ simplify(simplify(Expr::Case {
expr: None,
when_then_expr: vec![(
Box::new(col("c2").not_eq(lit(false))),
Box::new(lit("ok").eq(lit("ok"))),
)],
else_expr: Some(Box::new(col("c2").eq(lit(true)))),
- }),
+ })),
col("c2").or(col("c2").not().and(col("c2"))) // #1716
);
// CASE WHERE ISNULL(c2) THEN true ELSE c2
// -->
// ISNULL(c2) OR c2
+ //
+ // Need to call simplify 2x due to
+ // https://github.com/apache/arrow-datafusion/issues/1160
assert_eq!(
- simplify(Expr::Case {
+ simplify(simplify(Expr::Case {
expr: None,
when_then_expr: vec![(
Box::new(col("c2").is_null()),
Box::new(lit(true)),
)],
else_expr: Some(Box::new(col("c2"))),
- }),
+ })),
col("c2")
.is_null()
.or(col("c2").is_null().not().and(col("c2")))
@@ -1390,15 +1401,18 @@ mod tests {
// --> c1 OR (NOT(c1) AND c2 AND FALSE) OR (NOT(c1 OR c2) AND TRUE)
// --> c1 OR (NOT(c1 OR c2))
// --> NOT(c1) AND c2
+ //
+ // Need to call simplify 2x due to
+ // https://github.com/apache/arrow-datafusion/issues/1160
assert_eq!(
- simplify(Expr::Case {
+ simplify(simplify(Expr::Case {
expr: None,
when_then_expr: vec![
(Box::new(col("c1")), Box::new(lit(true)),),
(Box::new(col("c2")), Box::new(lit(false)),)
],
else_expr: Some(Box::new(lit(true))),
- }),
+ })),
col("c1").or(col("c1").or(col("c2")).not())
);
@@ -1406,15 +1420,18 @@ mod tests {
// --> c1 OR (NOT(c1) AND c2 AND TRUE) OR (NOT(c1 OR c2) AND FALSE)
// --> c1 OR (NOT(c1) AND c2)
// --> c1 OR c2
+ //
+ // Need to call simplify 2x due to
+ // https://github.com/apache/arrow-datafusion/issues/1160
assert_eq!(
- simplify(Expr::Case {
+ simplify(simplify(Expr::Case {
expr: None,
when_then_expr: vec![
(Box::new(col("c1")), Box::new(lit(true)),),
(Box::new(col("c2")), Box::new(lit(false)),)
],
else_expr: Some(Box::new(lit(true))),
- }),
+ })),
col("c1").or(col("c1").or(col("c2")).not())
);
}
diff --git a/datafusion/tests/simplification.rs b/datafusion/tests/simplification.rs
new file mode 100644
index 0000000..5edf43f
--- /dev/null
+++ b/datafusion/tests/simplification.rs
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This program demonstrates the DataFusion expression simplification API.
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::{
+ error::Result,
+ execution::context::ExecutionProps,
+ logical_plan::{DFSchema, Expr, SimplifyInfo},
+ prelude::*,
+};
+
+/// In order to simplify expressions, DataFusion must have information
+/// about the expressions.
+///
+/// You can provide that information using DataFusion [DFSchema]
+/// objects or from some other implemention
+struct MyInfo {
+ /// The input schema
+ schema: DFSchema,
+
+ /// Execution specific details needed for constant evaluation such
+ /// as the current time for `now()` and [VariableProviders]
+ execution_props: ExecutionProps,
+}
+
+impl SimplifyInfo for MyInfo {
+ fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+ Ok(matches!(expr.get_type(&self.schema)?, DataType::Boolean))
+ }
+
+ fn nullable(&self, expr: &Expr) -> Result<bool> {
+ expr.nullable(&self.schema)
+ }
+
+ fn execution_props(&self) -> &ExecutionProps {
+ &self.execution_props
+ }
+}
+
+impl From<DFSchema> for MyInfo {
+ fn from(schema: DFSchema) -> Self {
+ Self {
+ schema,
+ execution_props: ExecutionProps::new(),
+ }
+ }
+}
+
+/// A schema like:
+///
+/// a: Int32 (possibly with nulls)
+/// b: Int32
+/// s: Utf8
+fn schema() -> DFSchema {
+ Schema::new(vec![
+ Field::new("a", DataType::Int32, true),
+ Field::new("b", DataType::Int32, false),
+ Field::new("s", DataType::Utf8, false),
+ ])
+ .try_into()
+ .unwrap()
+}
+
+#[test]
+fn basic() {
+ let info: MyInfo = schema().into();
+
+ // The `Expr` is a core concept in DataFusion, and DataFusion can
+ // help simplify it.
+
+ // For example 'a < (2 + 3)' can be rewritten into the easier to
+ // optimize form `a < 5` automatically
+ let expr = col("a").lt(lit(2i32) + lit(3i32));
+
+ let simplified = expr.simplify(&info).unwrap();
+ assert_eq!(simplified, col("a").lt(lit(5i32)));
+}
+
+#[test]
+fn fold_and_simplify() {
+ let info: MyInfo = schema().into();
+
+ // What will it do with the expression `concat('foo', 'bar') == 'foobar')`?
+ let expr = concat(&[lit("foo"), lit("bar")]).eq(lit("foobar"));
+
+ // Since datafusion applies both simplification *and* rewriting
+ // some expressions can be entirely simplified
+ let simplified = expr.simplify(&info).unwrap();
+ assert_eq!(simplified, lit(true))
+}