You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/05/26 17:20:42 UTC

[arrow-datafusion] branch main updated: feat: eliminate useless join | convert inner to outer when condition is true (#6443)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new a9f0c7afe2 feat: eliminate useless join | convert inner to outer when condition is true (#6443)
a9f0c7afe2 is described below

commit a9f0c7afe2ea3bb14bfcceedfcd33cfe4b954035
Author: jakevin <ja...@gmail.com>
AuthorDate: Sat May 27 01:20:37 2023 +0800

    feat: eliminate useless join | convert inner to outer when condition is true (#6443)
    
    * minor
    
    * feat: eliminate useless join | convert inner to outer when condition is true
---
 datafusion/core/tests/simplification.rs            |   2 +-
 .../tests/sqllogictests/test_files/explain.slt     |   2 +
 .../core/tests/sqllogictests/test_files/join.slt   |  21 ++++
 datafusion/expr/src/field_util.rs                  |   7 +-
 datafusion/expr/src/logical_plan/display.rs        |   6 +-
 datafusion/expr/src/type_coercion/binary.rs        |   9 +-
 datafusion/optimizer/src/eliminate_join.rs         | 121 +++++++++++++++++++++
 datafusion/optimizer/src/lib.rs                    |   1 +
 datafusion/optimizer/src/optimizer.rs              |   2 +
 9 files changed, 152 insertions(+), 19 deletions(-)

diff --git a/datafusion/core/tests/simplification.rs b/datafusion/core/tests/simplification.rs
index 434a0ddbf1..b6d856b2d9 100644
--- a/datafusion/core/tests/simplification.rs
+++ b/datafusion/core/tests/simplification.rs
@@ -27,7 +27,7 @@ use datafusion_optimizer::simplify_expressions::{ExprSimplifier, SimplifyInfo};
 /// about the expressions.
 ///
 /// You can provide that information using DataFusion [DFSchema]
-/// objects or from some other implemention
+/// objects or from some other implementation
 struct MyInfo {
     /// The input schema
     schema: DFSchema,
diff --git a/datafusion/core/tests/sqllogictests/test_files/explain.slt b/datafusion/core/tests/sqllogictests/test_files/explain.slt
index 7656dcea63..75002fecb1 100644
--- a/datafusion/core/tests/sqllogictests/test_files/explain.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/explain.slt
@@ -147,6 +147,7 @@ analyzed_logical_plan SAME TEXT AS ABOVE
 logical_plan after simplify_expressions SAME TEXT AS ABOVE
 logical_plan after unwrap_cast_in_comparison SAME TEXT AS ABOVE
 logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
+logical_plan after eliminate_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
@@ -175,6 +176,7 @@ logical_plan after push_down_limit SAME TEXT AS ABOVE
 logical_plan after simplify_expressions SAME TEXT AS ABOVE
 logical_plan after unwrap_cast_in_comparison SAME TEXT AS ABOVE
 logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE
+logical_plan after eliminate_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
diff --git a/datafusion/core/tests/sqllogictests/test_files/join.slt b/datafusion/core/tests/sqllogictests/test_files/join.slt
index 2f49c5b97c..283ff57a98 100644
--- a/datafusion/core/tests/sqllogictests/test_files/join.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/join.slt
@@ -551,6 +551,27 @@ with t1 as (select 1 as col1, 'asd' as col2),
      t2 as (select 1 as col3, 'sdf' as col4)
 select col2, col4 from t1 full outer join t2 on col1 = col3
 
+# test eliminate join when condition is false
+query TT
+explain select * from t1 join t2 on false;
+----
+logical_plan EmptyRelation
+physical_plan EmptyExec: produce_one_row=false
+
+# test covert inner join to cross join when condition is true
+query TT
+explain select * from t1 inner join t2 on true;
+----
+logical_plan
+CrossJoin:
+--TableScan: t1 projection=[t1_id, t1_name, t1_int]
+--TableScan: t2 projection=[t2_id, t2_name, t2_int]
+physical_plan
+CrossJoinExec
+--CoalescePartitionsExec
+----MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0]
+--MemoryExec: partitions=4, partition_sizes=[1, 0, 0, 0]
+
 statement ok
 drop table IF EXISTS t1;
 
diff --git a/datafusion/expr/src/field_util.rs b/datafusion/expr/src/field_util.rs
index 8f5d6a6e34..feb96928c1 100644
--- a/datafusion/expr/src/field_util.rs
+++ b/datafusion/expr/src/field_util.rs
@@ -37,12 +37,7 @@ pub fn get_indexed_field(data_type: &DataType, key: &ScalarValue) -> Result<Fiel
                 ))
             } else {
                 let field = fields.iter().find(|f| f.name() == s);
-                match field {
-                    None => Err(DataFusionError::Plan(format!(
-                        "Field {s} not found in struct"
-                    ))),
-                    Some(f) => Ok(f.as_ref().clone()),
-                }
+                field.ok_or(DataFusionError::Plan(format!("Field {s} not found in struct"))).map(|f| f.as_ref().clone())
             }
         }
         (DataType::Struct(_), _) => Err(DataFusionError::Plan(
diff --git a/datafusion/expr/src/logical_plan/display.rs b/datafusion/expr/src/logical_plan/display.rs
index 68f5674780..c82689b2cc 100644
--- a/datafusion/expr/src/logical_plan/display.rs
+++ b/datafusion/expr/src/logical_plan/display.rs
@@ -242,10 +242,8 @@ impl<'a, 'b> TreeNodeVisitor for GraphvizVisitor<'a, 'b> {
         // always be non-empty as pre_visit always pushes
         // So it should always be Ok(true)
         let res = self.parent_ids.pop();
-        match res {
-            Some(_) => Ok(VisitRecursion::Continue),
-            None => Err(DataFusionError::Internal("Fail to format".to_string())),
-        }
+        res.ok_or(DataFusionError::Internal("Fail to format".to_string()))
+            .map(|_| VisitRecursion::Continue)
     }
 }
 
diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs
index 962434d652..86c5c15f14 100644
--- a/datafusion/expr/src/type_coercion/binary.rs
+++ b/datafusion/expr/src/type_coercion/binary.rs
@@ -203,14 +203,7 @@ pub fn coerce_types(
     };
 
     // re-write the error message of failed coercions to include the operator's information
-    match result {
-        None => Err(DataFusionError::Plan(
-            format!(
-                "{lhs_type:?} {op} {rhs_type:?} can't be evaluated because there isn't a common type to coerce the types to"
-            ),
-        )),
-        Some(t) => Ok(t)
-    }
+    result.ok_or(DataFusionError::Plan(format!("{lhs_type:?} {op} {rhs_type:?} can't be evaluated because there isn't a common type to coerce the types to")))
 }
 
 /// Coercion rules for mathematics operators between decimal and non-decimal types.
diff --git a/datafusion/optimizer/src/eliminate_join.rs b/datafusion/optimizer/src/eliminate_join.rs
new file mode 100644
index 0000000000..00abcdcc68
--- /dev/null
+++ b/datafusion/optimizer/src/eliminate_join.rs
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::optimizer::ApplyOrder;
+use crate::{OptimizerConfig, OptimizerRule};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::JoinType::Inner;
+use datafusion_expr::{
+    logical_plan::{EmptyRelation, LogicalPlan},
+    CrossJoin, Expr,
+};
+
+/// Eliminates joins when inner join condition is false.
+/// Replaces joins when inner join condition is true with a cross join.
+#[derive(Default)]
+pub struct EliminateJoin;
+
+impl EliminateJoin {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for EliminateJoin {
+    fn try_optimize(
+        &self,
+        plan: &LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Option<LogicalPlan>> {
+        match plan {
+            LogicalPlan::Join(join) if join.join_type == Inner && join.on.is_empty() => {
+                match join.filter {
+                    Some(Expr::Literal(ScalarValue::Boolean(Some(true)))) => {
+                        Ok(Some(LogicalPlan::CrossJoin(CrossJoin {
+                            left: join.left.clone(),
+                            right: join.right.clone(),
+                            schema: join.schema.clone(),
+                        })))
+                    }
+                    Some(Expr::Literal(ScalarValue::Boolean(Some(false)))) => {
+                        Ok(Some(LogicalPlan::EmptyRelation(EmptyRelation {
+                            produce_one_row: false,
+                            schema: join.schema.clone(),
+                        })))
+                    }
+                    _ => Ok(None),
+                }
+            }
+            _ => Ok(None),
+        }
+    }
+
+    fn name(&self) -> &str {
+        "eliminate_join"
+    }
+
+    fn apply_order(&self) -> Option<ApplyOrder> {
+        Some(ApplyOrder::TopDown)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::eliminate_join::EliminateJoin;
+    use crate::test::*;
+    use datafusion_common::{Column, Result, ScalarValue};
+    use datafusion_expr::JoinType::Inner;
+    use datafusion_expr::{logical_plan::builder::LogicalPlanBuilder, Expr, LogicalPlan};
+    use std::sync::Arc;
+
+    fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> {
+        assert_optimized_plan_eq(Arc::new(EliminateJoin::new()), plan, expected)
+    }
+
+    #[test]
+    fn join_on_false() -> Result<()> {
+        let plan = LogicalPlanBuilder::empty(false)
+            .join(
+                LogicalPlanBuilder::empty(false).build()?,
+                Inner,
+                (Vec::<Column>::new(), Vec::<Column>::new()),
+                Some(Expr::Literal(ScalarValue::Boolean(Some(false)))),
+            )?
+            .build()?;
+
+        let expected = "EmptyRelation";
+        assert_optimized_plan_equal(&plan, expected)
+    }
+
+    #[test]
+    fn join_on_true() -> Result<()> {
+        let plan = LogicalPlanBuilder::empty(false)
+            .join(
+                LogicalPlanBuilder::empty(false).build()?,
+                Inner,
+                (Vec::<Column>::new(), Vec::<Column>::new()),
+                Some(Expr::Literal(ScalarValue::Boolean(Some(true)))),
+            )?
+            .build()?;
+
+        let expected = "\
+        CrossJoin:\
+        \n  EmptyRelation\
+        \n  EmptyRelation";
+        assert_optimized_plan_equal(&plan, expected)
+    }
+}
diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs
index 2af5edbd9f..b8217e0ac7 100644
--- a/datafusion/optimizer/src/lib.rs
+++ b/datafusion/optimizer/src/lib.rs
@@ -22,6 +22,7 @@ pub mod decorrelate_predicate_subquery;
 pub mod eliminate_cross_join;
 pub mod eliminate_duplicated_expr;
 pub mod eliminate_filter;
+pub mod eliminate_join;
 pub mod eliminate_limit;
 pub mod eliminate_outer_join;
 pub mod eliminate_project;
diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs
index 284280a3a5..f2e6c340d7 100644
--- a/datafusion/optimizer/src/optimizer.rs
+++ b/datafusion/optimizer/src/optimizer.rs
@@ -22,6 +22,7 @@ use crate::decorrelate_predicate_subquery::DecorrelatePredicateSubquery;
 use crate::eliminate_cross_join::EliminateCrossJoin;
 use crate::eliminate_duplicated_expr::EliminateDuplicatedExpr;
 use crate::eliminate_filter::EliminateFilter;
+use crate::eliminate_join::EliminateJoin;
 use crate::eliminate_limit::EliminateLimit;
 use crate::eliminate_outer_join::EliminateOuterJoin;
 use crate::eliminate_project::EliminateProjection;
@@ -210,6 +211,7 @@ impl Optimizer {
             Arc::new(SimplifyExpressions::new()),
             Arc::new(UnwrapCastInComparison::new()),
             Arc::new(ReplaceDistinctWithAggregate::new()),
+            Arc::new(EliminateJoin::new()),
             Arc::new(DecorrelatePredicateSubquery::new()),
             Arc::new(ScalarSubqueryToJoin::new()),
             Arc::new(ExtractEquijoinPredicate::new()),