You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/28 19:57:43 UTC

[GitHub] [arrow-datafusion] alamb commented on a change in pull request #2054: add some pre-check for LogicalPlanBuilder

alamb commented on a change in pull request #2054:
URL: https://github.com/apache/arrow-datafusion/pull/2054#discussion_r836793959



##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1162,6 +1164,117 @@ pub(crate) fn expand_qualified_wildcard(
     expand_wildcard(&qualifier_schema, plan)
 }
 
+/// check whether the logical plan we are building is valid
+fn check_plan_invalid(plan: &LogicalPlan) -> Result<()> {
+    match plan {
+        LogicalPlan::Projection(Projection { expr, input, .. })
+        | LogicalPlan::Sort(Sort { expr, input })
+        | LogicalPlan::Window(Window {
+            window_expr: expr,
+            input,
+            ..
+        }) => check_plan_invalid(input)
+            .and_then(|_| check_any_invalid_expr(expr, input.schema())),
+
+        LogicalPlan::Filter(Filter {
+            predicate: expr,
+            input,
+        }) => check_plan_invalid(input)
+            .and_then(|_| check_invalid_expr(expr, input.schema())),
+
+        LogicalPlan::Aggregate(Aggregate {
+            input,
+            group_expr,
+            aggr_expr,
+            ..
+        }) => {
+            let schema = input.schema();
+            check_plan_invalid(input)
+                .and_then(|_| check_any_invalid_expr(group_expr, schema))
+                .and_then(|_| check_any_invalid_expr(aggr_expr, schema))
+        }
+
+        LogicalPlan::Join(Join { left, right, .. })
+        | LogicalPlan::CrossJoin(CrossJoin { left, right, .. }) => {
+            check_plan_invalid(left).and_then(|_| check_plan_invalid(right))
+        }
+
+        LogicalPlan::Repartition(Repartition { input, .. })
+        | LogicalPlan::Limit(Limit { input, .. })
+        | LogicalPlan::Explain(Explain { plan: input, .. })
+        | LogicalPlan::Analyze(Analyze { input, .. }) => check_plan_invalid(input),
+
+        LogicalPlan::Union(Union { inputs, .. }) => {
+            inputs.iter().try_for_each(check_plan_invalid)
+        }
+
+        LogicalPlan::TableScan(TableScan {
+            table_name: _,
+            source,
+            projection,
+            projected_schema,
+            filters,
+            limit: _,
+        }) => {
+            if let Some(projection) = projection {
+                if projection.len() > projected_schema.fields().len() {
+                    return Err(DataFusionError::Plan(
+                        "projection contains columns that doesnt belong to projected schema"
+                            .to_owned(),
+                    ));
+                }
+            }
+            let schema = &source.schema().to_dfschema_ref()?;
+            check_any_invalid_expr(filters, schema)
+        }
+        _ => Ok(()),
+    }
+}
+
+/// find first error in the exprs
+fn check_any_invalid_expr(exprs: &[Expr], schema: &DFSchemaRef) -> Result<()> {
+    exprs.iter().try_for_each(|e| check_invalid_expr(e, schema))
+}
+
+/// do some checks for exprs in a logical plan
+fn check_invalid_expr(expr: &Expr, schema: &DFSchemaRef) -> Result<()> {

Review comment:
       I think I may have missed it, but as written I don't think this recurses into the `expr` -- so while it will find nonsense like
   
   ```
   NOT "foo"
   ```
   
   I don't think it would find
   ```
   A = 5 OR (NOT "foo")
   ```
   
   We could use an `ExpressionVisitor` for thishttps://github.com/apache/arrow-datafusion/blob/634252b44cec8dc904e48926d0aa54feeb4d48af/datafusion/src/logical_plan/expr_visitor.rs#L32-L47
   
   Here is an example of using `ExpressionVisitor` for something similar: 
   https://github.com/apache/arrow-datafusion/blob/2d6addd4c435123e934ce04564d8cc77bf101c37/datafusion/src/datasource/listing/helpers.rs#L55-L119

##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
         assert!(stringified_plan.should_display(true));
         assert!(!stringified_plan.should_display(false));
     }
+
+    #[test]
+    fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+        let mut plan = LogicalPlanBuilder::scan_empty(
+            Some("employee_csv"),
+            &employee_schema(),
+            Some(vec![0, 1, 2, 3, 4]),
+        )?;
+        plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+            relation: None,
+            name: "id".to_string(),
+        })))])?;

Review comment:
       I think the "builder" API for expressions makes for much easier reading:
   
   ```suggestion
           plan = plan.project(vec![col("id").not()])?;
   ```
   
   

##########
File path: datafusion/src/sql/planner.rs
##########
@@ -2347,9 +2347,9 @@ mod tests {
     #[test]
     fn select_neg_filter() {
         let sql = "SELECT id, first_name, last_name \

Review comment:
       👍 

##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1162,6 +1164,117 @@ pub(crate) fn expand_qualified_wildcard(
     expand_wildcard(&qualifier_schema, plan)
 }
 
+/// check whether the logical plan we are building is valid

Review comment:
       As a follow on PR, it would be awesome to encapsulate this `LogicalPlan` tree walking code into some sort of visitor. 

##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
         assert!(stringified_plan.should_display(true));
         assert!(!stringified_plan.should_display(false));
     }
+
+    #[test]
+    fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+        let mut plan = LogicalPlanBuilder::scan_empty(
+            Some("employee_csv"),
+            &employee_schema(),
+            Some(vec![0, 1, 2, 3, 4]),
+        )?;
+        plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+            relation: None,
+            name: "id".to_string(),
+        })))])?;
+        match plan.build() {
+            Ok(_) => panic!("Unexpect Ok"),
+            Err(e) => {
+                assert_eq!(
+                    e.to_string(),
+                    "Error during planning: \
+                    Invalid Not Expression(NOT #employee_csv.id) -- \
+                    Type(Int32) of expression(#employee_csv.id) must be of a boolean type"
+                )
+            }
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn invalid_plan_cast_utf8_boolean() -> Result<()> {
+        let mut plan = LogicalPlanBuilder::scan_empty(
+            Some("employee_csv"),
+            &employee_schema(),
+            Some(vec![0, 1, 2, 3, 4]),
+        )?;
+        let col = Expr::Column(Column {
+            relation: None,
+            name: "first_name".to_owned(),
+        });

Review comment:
       ```suggestion
           let col = col("first_name");
   ```

##########
File path: datafusion/src/logical_plan/builder.rs
##########
@@ -1395,4 +1508,58 @@ mod tests {
         assert!(stringified_plan.should_display(true));
         assert!(!stringified_plan.should_display(false));
     }
+
+    #[test]
+    fn invalid_plan_with_not_non_boolean_expression() -> Result<()> {
+        let mut plan = LogicalPlanBuilder::scan_empty(
+            Some("employee_csv"),
+            &employee_schema(),
+            Some(vec![0, 1, 2, 3, 4]),
+        )?;
+        plan = plan.project(vec![Expr::Not(Box::new(Expr::Column(Column {
+            relation: None,
+            name: "id".to_string(),
+        })))])?;
+        match plan.build() {
+            Ok(_) => panic!("Unexpect Ok"),
+            Err(e) => {
+                assert_eq!(
+                    e.to_string(),
+                    "Error during planning: \
+                    Invalid Not Expression(NOT #employee_csv.id) -- \
+                    Type(Int32) of expression(#employee_csv.id) must be of a boolean type"
+                )
+            }
+        }
+        Ok(())
+    }
+
+    #[test]
+    fn invalid_plan_cast_utf8_boolean() -> Result<()> {
+        let mut plan = LogicalPlanBuilder::scan_empty(
+            Some("employee_csv"),
+            &employee_schema(),
+            Some(vec![0, 1, 2, 3, 4]),
+        )?;
+        let col = Expr::Column(Column {
+            relation: None,
+            name: "first_name".to_owned(),
+        });
+        plan = plan.project(vec![Expr::Cast {
+            expr: Box::new(col),
+            data_type: DataType::Boolean,
+        }])?;
+        match plan.build() {
+            Ok(_) => panic!("Unexpect Ok"),
+            Err(e) => {
+                assert_eq!(
+                    e.to_string(),
+                    "Error during planning: \
+                    Invalid Cast Expression(CAST(#employee_csv.first_name AS Boolean)) -- \
+                    Utf8 cannot cast to Boolean"
+                )
+            }
+        }
+        Ok(())
+    }

Review comment:
       Can we also please add a test for using wildcards incorrectly -- like `SELECT .. GROUP BY *` and  `SELECT .. GROUP BY foo*`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org