You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/02/02 14:28:27 UTC

[arrow-datafusion] branch master updated: Add Expression Simplification API (#1717)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new e4a056f  Add Expression Simplification API (#1717)
e4a056f is described below

commit e4a056f00c4c5a09bddc1d16b83f771926f7b4a9
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Feb 2 09:28:20 2022 -0500

    Add Expression Simplification API (#1717)
    
    * Add Expression Simplification API
    
    * fmt
---
 datafusion/src/logical_plan/expr.rs              |  68 +++++++++
 datafusion/src/logical_plan/mod.rs               |   1 +
 datafusion/src/optimizer/simplify_expressions.rs | 179 +++++++++++++----------
 datafusion/tests/simplification.rs               | 106 ++++++++++++++
 4 files changed, 273 insertions(+), 81 deletions(-)

diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs
index a1e51e0..63bf72c 100644
--- a/datafusion/src/logical_plan/expr.rs
+++ b/datafusion/src/logical_plan/expr.rs
@@ -20,10 +20,12 @@
 
 pub use super::Operator;
 use crate::error::{DataFusionError, Result};
+use crate::execution::context::ExecutionProps;
 use crate::field_util::get_indexed_field;
 use crate::logical_plan::{
     plan::Aggregate, window_frames, DFField, DFSchema, LogicalPlan,
 };
+use crate::optimizer::simplify_expressions::{ConstEvaluator, Simplifier};
 use crate::physical_plan::functions::Volatility;
 use crate::physical_plan::{
     aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF,
@@ -971,6 +973,58 @@ impl Expr {
             Ok(expr)
         }
     }
+
+    /// Simplifies this [`Expr`]`s as much as possible, evaluating
+    /// constants and applying algebraic simplifications
+    ///
+    /// # Example:
+    /// `b > 2 AND b > 2`
+    /// can be written to
+    /// `b > 2`
+    ///
+    /// ```
+    /// use datafusion::logical_plan::*;
+    /// use datafusion::error::Result;
+    /// use datafusion::execution::context::ExecutionProps;
+    ///
+    /// /// Simple implementation that provides `Simplifier` the information it needs
+    /// #[derive(Default)]
+    /// struct Info {
+    ///   execution_props: ExecutionProps,
+    /// };
+    ///
+    /// impl SimplifyInfo for Info {
+    ///   fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+    ///     Ok(false)
+    ///   }
+    ///   fn nullable(&self, expr: &Expr) -> Result<bool> {
+    ///     Ok(true)
+    ///   }
+    ///   fn execution_props(&self) -> &ExecutionProps {
+    ///     &self.execution_props
+    ///   }
+    /// }
+    ///
+    /// // b < 2
+    /// let b_lt_2 = col("b").gt(lit(2));
+    ///
+    /// // (b < 2) OR (b < 2)
+    /// let expr = b_lt_2.clone().or(b_lt_2.clone());
+    ///
+    /// // (b < 2) OR (b < 2) --> (b < 2)
+    /// let expr = expr.simplify(&Info::default()).unwrap();
+    /// assert_eq!(expr, b_lt_2);
+    /// ```
+    pub fn simplify<S: SimplifyInfo>(self, info: &S) -> Result<Self> {
+        let mut rewriter = Simplifier::new(info);
+        let mut const_evaluator = ConstEvaluator::new(info.execution_props());
+
+        // TODO iterate until no changes are made during rewrite
+        // (evaluating constants can enable new simplifications and
+        // simplifications can enable new constant evaluation)
+        // https://github.com/apache/arrow-datafusion/issues/1160
+        self.rewrite(&mut const_evaluator)?.rewrite(&mut rewriter)
+    }
 }
 
 impl Not for Expr {
@@ -1092,6 +1146,20 @@ pub trait ExprRewriter: Sized {
     fn mutate(&mut self, expr: Expr) -> Result<Expr>;
 }
 
+/// The information necessary to apply algebraic simplification to an
+/// [Expr]. See [SimplifyContext] for one implementation
+pub trait SimplifyInfo {
+    /// returns true if this Expr has boolean type
+    fn is_boolean_type(&self, expr: &Expr) -> Result<bool>;
+
+    /// returns true of this expr is nullable (could possibly be NULL)
+    fn nullable(&self, expr: &Expr) -> Result<bool>;
+
+    /// Returns details needed for partial expression evaluation
+    fn execution_props(&self) -> &ExecutionProps;
+}
+
+/// Helper struct for building [Expr::Case]
 pub struct CaseBuilder {
     expr: Option<Box<Expr>>,
     when_expr: Vec<Expr>,
diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs
index 06c6bf9..2571451 100644
--- a/datafusion/src/logical_plan/mod.rs
+++ b/datafusion/src/logical_plan/mod.rs
@@ -47,6 +47,7 @@ pub use expr::{
     signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, to_hex,
     translate, trim, trunc, unalias, unnormalize_col, unnormalize_cols, upper, when,
     Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion,
+    SimplifyInfo,
 };
 pub use extension::UserDefinedLogicalNode;
 pub use operators::Operator;
diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs
index 00739cc..c000bdb 100644
--- a/datafusion/src/optimizer/simplify_expressions.rs
+++ b/datafusion/src/optimizer/simplify_expressions.rs
@@ -23,8 +23,10 @@ use arrow::record_batch::RecordBatch;
 
 use crate::error::DataFusionError;
 use crate::execution::context::ExecutionProps;
-use crate::logical_plan::{lit, DFSchemaRef, Expr};
-use crate::logical_plan::{DFSchema, ExprRewriter, LogicalPlan, RewriteRecursion};
+use crate::logical_plan::{
+    lit, DFSchema, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, RewriteRecursion,
+    SimplifyInfo,
+};
 use crate::optimizer::optimizer::OptimizerRule;
 use crate::optimizer::utils;
 use crate::physical_plan::functions::Volatility;
@@ -32,8 +34,57 @@ use crate::physical_plan::planner::create_physical_expr;
 use crate::scalar::ScalarValue;
 use crate::{error::Result, logical_plan::Operator};
 
-/// Simplifies plans by rewriting [`Expr`]`s evaluating constants
-/// and applying algebraic simplifications
+/// Provides simplification information based on schema and properties
+struct SimplifyContext<'a, 'b> {
+    schemas: Vec<&'a DFSchemaRef>,
+    props: &'b ExecutionProps,
+}
+
+impl<'a, 'b> SimplifyContext<'a, 'b> {
+    /// Create a new SimplifyContext
+    pub fn new(schemas: Vec<&'a DFSchemaRef>, props: &'b ExecutionProps) -> Self {
+        Self { schemas, props }
+    }
+}
+
+impl<'a, 'b> SimplifyInfo for SimplifyContext<'a, 'b> {
+    /// returns true if this Expr has boolean type
+    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+        for schema in &self.schemas {
+            if let Ok(DataType::Boolean) = expr.get_type(schema) {
+                return Ok(true);
+            }
+        }
+
+        Ok(false)
+    }
+    /// Returns true if expr is nullable
+    fn nullable(&self, expr: &Expr) -> Result<bool> {
+        self.schemas
+            .iter()
+            .find_map(|schema| {
+                // expr may be from another input, so ignore errors
+                // by converting to None to keep trying
+                expr.nullable(schema.as_ref()).ok()
+            })
+            .ok_or_else(|| {
+                // This means we weren't able to compute `Expr::nullable` with
+                // *any* input schemas, signalling a problem
+                DataFusionError::Internal(format!(
+                    "Could not find find columns in '{}' during simplify",
+                    expr
+                ))
+            })
+    }
+
+    fn execution_props(&self) -> &ExecutionProps {
+        self.props
+    }
+}
+
+/// Optimizer Pass that simplifies [`LogicalPlan`]s by rewriting
+/// [`Expr`]`s evaluating constants and applying algebraic
+/// simplifications
 ///
 /// # Introduction
 /// It uses boolean algebra laws to simplify or reduce the number of terms in expressions.
@@ -44,7 +95,7 @@ use crate::{error::Result, logical_plan::Operator};
 /// `Filter: b > 2`
 ///
 #[derive(Default)]
-pub struct SimplifyExpressions {}
+pub(crate) struct SimplifyExpressions {}
 
 /// returns true if `needle` is found in a chain of search_op
 /// expressions. Such as: (A AND B) AND C
@@ -150,9 +201,7 @@ impl OptimizerRule for SimplifyExpressions {
         // projected columns. With just the projected schema, it's not possible to infer types for
         // expressions that references non-projected columns within the same project plan or its
         // children plans.
-        let mut simplifier = Simplifier::new(plan.all_schemas());
-
-        let mut const_evaluator = ConstEvaluator::new(execution_props);
+        let info = SimplifyContext::new(plan.all_schemas(), execution_props);
 
         let new_inputs = plan
             .inputs()
@@ -168,15 +217,8 @@ impl OptimizerRule for SimplifyExpressions {
                 // Constant folding should not change expression name.
                 let name = &e.name(plan.schema());
 
-                // TODO iterate until no changes are made
-                // during rewrite (evaluating constants can
-                // enable new simplifications and
-                // simplifications can enable new constant
-                // evaluation)
-                let new_e = e
-                    // fold constants and then simplify
-                    .rewrite(&mut const_evaluator)?
-                    .rewrite(&mut simplifier)?;
+                // Apply the actual simplification logic
+                let new_e = e.simplify(&info)?;
 
                 let new_name = &new_e.name(plan.schema());
 
@@ -389,52 +431,23 @@ impl<'a> ConstEvaluator<'a> {
 /// * `false = true` and `true = false` to `false`
 /// * `!!expr` to `expr`
 /// * `expr = null` and `expr != null` to `null`
-pub(crate) struct Simplifier<'a> {
-    /// input schemas
-    schemas: Vec<&'a DFSchemaRef>,
+pub(crate) struct Simplifier<'a, S> {
+    info: &'a S,
 }
 
-impl<'a> Simplifier<'a> {
-    pub fn new(schemas: Vec<&'a DFSchemaRef>) -> Self {
-        Self { schemas }
-    }
-
-    fn is_boolean_type(&self, expr: &Expr) -> bool {
-        for schema in &self.schemas {
-            if let Ok(DataType::Boolean) = expr.get_type(schema) {
-                return true;
-            }
-        }
-
-        false
-    }
-
-    /// Returns true if expr is nullable
-    fn nullable(&self, expr: &Expr) -> Result<bool> {
-        self.schemas
-            .iter()
-            .find_map(|schema| {
-                // expr may be from another input, so ignore errors
-                // by converting to None to keep trying
-                expr.nullable(schema.as_ref()).ok()
-            })
-            .ok_or_else(|| {
-                // This means we weren't able to compute `Expr::nullable` with
-                // *any* input schemas, signalling a problem
-                DataFusionError::Internal(format!(
-                    "Could not find find columns in '{}' during simplify",
-                    expr
-                ))
-            })
+impl<'a, S> Simplifier<'a, S> {
+    pub fn new(info: &'a S) -> Self {
+        Self { info }
     }
 }
 
-impl<'a> ExprRewriter for Simplifier<'a> {
+impl<'a, S: SimplifyInfo> ExprRewriter for Simplifier<'a, S> {
     /// rewrite the expression simplifying any constant expressions
     fn mutate(&mut self, expr: Expr) -> Result<Expr> {
         use Expr::*;
         use Operator::{And, Divide, Eq, Multiply, NotEq, Or};
 
+        let info = self.info;
         let new_expr = match expr {
             //
             // Rules for Eq
@@ -447,7 +460,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: Eq,
                 right,
-            } if is_bool_lit(&left) && self.is_boolean_type(&right) => {
+            } if is_bool_lit(&left) && info.is_boolean_type(&right)? => {
                 match as_bool_lit(*left) {
                     Some(true) => *right,
                     Some(false) => Not(right),
@@ -461,7 +474,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: Eq,
                 right,
-            } if is_bool_lit(&right) && self.is_boolean_type(&left) => {
+            } if is_bool_lit(&right) && info.is_boolean_type(&left)? => {
                 match as_bool_lit(*right) {
                     Some(true) => *left,
                     Some(false) => Not(left),
@@ -480,7 +493,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: NotEq,
                 right,
-            } if is_bool_lit(&left) && self.is_boolean_type(&right) => {
+            } if is_bool_lit(&left) && info.is_boolean_type(&right)? => {
                 match as_bool_lit(*left) {
                     Some(true) => Not(right),
                     Some(false) => *right,
@@ -494,7 +507,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: NotEq,
                 right,
-            } if is_bool_lit(&right) && self.is_boolean_type(&left) => {
+            } if is_bool_lit(&right) && info.is_boolean_type(&left)? => {
                 match as_bool_lit(*right) {
                     Some(true) => Not(left),
                     Some(false) => *left,
@@ -547,13 +560,13 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: Or,
                 right,
-            } if !self.nullable(&right)? && is_op_with(And, &right, &left) => *left,
+            } if !info.nullable(&right)? && is_op_with(And, &right, &left) => *left,
             // (A AND B) OR A --> A (if B not null)
             BinaryExpr {
                 left,
                 op: Or,
                 right,
-            } if !self.nullable(&left)? && is_op_with(And, &left, &right) => *right,
+            } if !info.nullable(&left)? && is_op_with(And, &left, &right) => *right,
 
             //
             // Rules for AND
@@ -600,13 +613,13 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: And,
                 right,
-            } if !self.nullable(&right)? && is_op_with(Or, &right, &left) => *left,
+            } if !info.nullable(&right)? && is_op_with(Or, &right, &left) => *left,
             // (A OR B) AND A --> A (if B not null)
             BinaryExpr {
                 left,
                 op: And,
                 right,
-            } if !self.nullable(&left)? && is_op_with(Or, &left, &right) => *right,
+            } if !info.nullable(&left)? && is_op_with(Or, &left, &right) => *right,
 
             //
             // Rules for Multiply
@@ -643,7 +656,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 left,
                 op: Divide,
                 right,
-            } if !self.nullable(&left)? && left == right => lit(1),
+            } if !info.nullable(&left)? && left == right => lit(1),
 
             //
             // Rules for Not
@@ -676,7 +689,7 @@ impl<'a> ExprRewriter for Simplifier<'a> {
                 else_expr,
             } if !when_then_expr.is_empty()
                 && when_then_expr.len() < 3 // The rewrite is O(n!) so limit to small number
-                && self.is_boolean_type(&when_then_expr[0].1) =>
+                && info.is_boolean_type(&when_then_expr[0].1)? =>
             {
                 // The disjunction of all the when predicates encountered so far
                 let mut filter_expr = lit(false);
@@ -1208,17 +1221,9 @@ mod tests {
 
     fn simplify(expr: Expr) -> Expr {
         let schema = expr_test_schema();
-        let mut rewriter = Simplifier::new(vec![&schema]);
-
         let execution_props = ExecutionProps::new();
-        let mut const_evaluator = ConstEvaluator::new(&execution_props);
-
-        expr.rewrite(&mut rewriter)
-            .expect("expected to simplify")
-            .rewrite(&mut const_evaluator)
-            .expect("expected to const evaluate")
-            .rewrite(&mut rewriter)
-            .expect("expected to simplify")
+        let info = SimplifyContext::new(vec![&schema], &execution_props);
+        expr.simplify(&info).unwrap()
     }
 
     fn expr_test_schema() -> DFSchemaRef {
@@ -1357,30 +1362,36 @@ mod tests {
         // CASE WHERE c2 THEN true ELSE c2
         // -->
         // c2
+        //
+        // Need to call simplify 2x due to
+        // https://github.com/apache/arrow-datafusion/issues/1160
         assert_eq!(
-            simplify(Expr::Case {
+            simplify(simplify(Expr::Case {
                 expr: None,
                 when_then_expr: vec![(
                     Box::new(col("c2").not_eq(lit(false))),
                     Box::new(lit("ok").eq(lit("ok"))),
                 )],
                 else_expr: Some(Box::new(col("c2").eq(lit(true)))),
-            }),
+            })),
             col("c2").or(col("c2").not().and(col("c2"))) // #1716
         );
 
         // CASE WHERE ISNULL(c2) THEN true ELSE c2
         // -->
         // ISNULL(c2) OR c2
+        //
+        // Need to call simplify 2x due to
+        // https://github.com/apache/arrow-datafusion/issues/1160
         assert_eq!(
-            simplify(Expr::Case {
+            simplify(simplify(Expr::Case {
                 expr: None,
                 when_then_expr: vec![(
                     Box::new(col("c2").is_null()),
                     Box::new(lit(true)),
                 )],
                 else_expr: Some(Box::new(col("c2"))),
-            }),
+            })),
             col("c2")
                 .is_null()
                 .or(col("c2").is_null().not().and(col("c2")))
@@ -1390,15 +1401,18 @@ mod tests {
         // --> c1 OR (NOT(c1) AND c2 AND FALSE) OR (NOT(c1 OR c2) AND TRUE)
         // --> c1 OR (NOT(c1 OR c2))
         // --> NOT(c1) AND c2
+        //
+        // Need to call simplify 2x due to
+        // https://github.com/apache/arrow-datafusion/issues/1160
         assert_eq!(
-            simplify(Expr::Case {
+            simplify(simplify(Expr::Case {
                 expr: None,
                 when_then_expr: vec![
                     (Box::new(col("c1")), Box::new(lit(true)),),
                     (Box::new(col("c2")), Box::new(lit(false)),)
                 ],
                 else_expr: Some(Box::new(lit(true))),
-            }),
+            })),
             col("c1").or(col("c1").or(col("c2")).not())
         );
 
@@ -1406,15 +1420,18 @@ mod tests {
         // --> c1 OR (NOT(c1) AND c2 AND TRUE) OR (NOT(c1 OR c2) AND FALSE)
         // --> c1 OR (NOT(c1) AND c2)
         // --> c1 OR c2
+        //
+        // Need to call simplify 2x due to
+        // https://github.com/apache/arrow-datafusion/issues/1160
         assert_eq!(
-            simplify(Expr::Case {
+            simplify(simplify(Expr::Case {
                 expr: None,
                 when_then_expr: vec![
                     (Box::new(col("c1")), Box::new(lit(true)),),
                     (Box::new(col("c2")), Box::new(lit(false)),)
                 ],
                 else_expr: Some(Box::new(lit(true))),
-            }),
+            })),
             col("c1").or(col("c1").or(col("c2")).not())
         );
     }
diff --git a/datafusion/tests/simplification.rs b/datafusion/tests/simplification.rs
new file mode 100644
index 0000000..5edf43f
--- /dev/null
+++ b/datafusion/tests/simplification.rs
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This program demonstrates the DataFusion expression simplification API.
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion::{
+    error::Result,
+    execution::context::ExecutionProps,
+    logical_plan::{DFSchema, Expr, SimplifyInfo},
+    prelude::*,
+};
+
+/// In order to simplify expressions, DataFusion must have information
+/// about the expressions.
+///
+/// You can provide that information using DataFusion [DFSchema]
+/// objects or from some other implemention
+struct MyInfo {
+    /// The input schema
+    schema: DFSchema,
+
+    /// Execution specific details needed for constant evaluation such
+    /// as the current time for `now()` and [VariableProviders]
+    execution_props: ExecutionProps,
+}
+
+impl SimplifyInfo for MyInfo {
+    fn is_boolean_type(&self, expr: &Expr) -> Result<bool> {
+        Ok(matches!(expr.get_type(&self.schema)?, DataType::Boolean))
+    }
+
+    fn nullable(&self, expr: &Expr) -> Result<bool> {
+        expr.nullable(&self.schema)
+    }
+
+    fn execution_props(&self) -> &ExecutionProps {
+        &self.execution_props
+    }
+}
+
+impl From<DFSchema> for MyInfo {
+    fn from(schema: DFSchema) -> Self {
+        Self {
+            schema,
+            execution_props: ExecutionProps::new(),
+        }
+    }
+}
+
+/// A schema like:
+///
+/// a: Int32 (possibly with nulls)
+/// b: Int32
+/// s: Utf8
+fn schema() -> DFSchema {
+    Schema::new(vec![
+        Field::new("a", DataType::Int32, true),
+        Field::new("b", DataType::Int32, false),
+        Field::new("s", DataType::Utf8, false),
+    ])
+    .try_into()
+    .unwrap()
+}
+
+#[test]
+fn basic() {
+    let info: MyInfo = schema().into();
+
+    // The `Expr` is a core concept in DataFusion, and DataFusion can
+    // help simplify it.
+
+    // For example 'a < (2 + 3)' can be rewritten into the easier to
+    // optimize form `a < 5` automatically
+    let expr = col("a").lt(lit(2i32) + lit(3i32));
+
+    let simplified = expr.simplify(&info).unwrap();
+    assert_eq!(simplified, col("a").lt(lit(5i32)));
+}
+
+#[test]
+fn fold_and_simplify() {
+    let info: MyInfo = schema().into();
+
+    // What will it do with the expression `concat('foo', 'bar') == 'foobar')`?
+    let expr = concat(&[lit("foo"), lit("bar")]).eq(lit("foobar"));
+
+    // Since datafusion applies both simplification *and* rewriting
+    // some expressions can be entirely simplified
+    let simplified = expr.simplify(&info).unwrap();
+    assert_eq!(simplified, lit(true))
+}