You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/06/26 12:51:21 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #2794: Use coerced type in inlist expr planning

alamb commented on code in PR #2794:
URL: https://github.com/apache/arrow-datafusion/pull/2794#discussion_r906808831


##########
datafusion/expr/src/binary_rule.rs:
##########
@@ -166,6 +166,7 @@ pub fn comparison_eq_coercion(
         .or_else(|| temporal_coercion(lhs_type, rhs_type))
         .or_else(|| string_coercion(lhs_type, rhs_type))
         .or_else(|| null_coercion(lhs_type, rhs_type))
+        .or_else(|| string_numeric_coercion(lhs_type, rhs_type))

Review Comment:
   This makes a lot of sense -- thanks. 👍 



##########
datafusion/physical-expr/src/planner.rs:
##########
@@ -294,43 +292,18 @@ pub fn create_physical_expr(
                             input_schema,
                             execution_props,
                         ),
-                        // TODO refactor the logic of coercion the data type

Review Comment:
   cc @liukun4515 



##########
datafusion/expr/src/binary_rule.rs:
##########
@@ -185,6 +186,17 @@ fn comparison_order_coercion(
         .or_else(|| null_coercion(lhs_type, rhs_type))
 }
 
+fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {

Review Comment:
   👍 



##########
datafusion/physical-expr/src/planner.rs:
##########
@@ -339,3 +312,48 @@ pub fn create_physical_expr(
         ))),
     }
 }
+
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
+pub(crate) fn in_list_cast(
+    expr: Arc<dyn PhysicalExpr>,
+    list: Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<InListCastResult> {
+    let expr_type = &expr.data_type(input_schema)?;
+    let list_types: Vec<DataType> = list
+        .iter()
+        .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+        .collect();
+    let result_type = get_coerce_type(expr_type, &list_types);
+    match result_type {
+        None => Err(DataFusionError::Internal(format!(
+            "In expr can find the coerced type for {:?} in {:?}",

Review Comment:
   ```suggestion
               "Can not find compatible  types to compare {:?} with {:?}",
   ```



##########
datafusion/physical-expr/src/planner.rs:
##########
@@ -339,3 +312,48 @@ pub fn create_physical_expr(
         ))),
     }
 }
+
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
+pub(crate) fn in_list_cast(
+    expr: Arc<dyn PhysicalExpr>,
+    list: Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<InListCastResult> {
+    let expr_type = &expr.data_type(input_schema)?;
+    let list_types: Vec<DataType> = list
+        .iter()
+        .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+        .collect();
+    let result_type = get_coerce_type(expr_type, &list_types);
+    match result_type {
+        None => Err(DataFusionError::Internal(format!(
+            "In expr can find the coerced type for {:?} in {:?}",
+            expr_type, list_types
+        ))),
+        Some(data_type) => {
+            // find the coerced type
+            let cast_expr = try_cast(expr, input_schema, data_type.clone())?;
+            let cast_list_expr = list
+                .into_iter()
+                .map(|list_expr| {
+                    try_cast(list_expr, input_schema, data_type.clone()).unwrap()
+                })
+                .collect();
+            Ok((cast_expr, cast_list_expr))
+        }
+    }
+}
+
+fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option<DataType> {

Review Comment:
   ```suggestion
   /// Attempts to coerce the types of `list_type` to be comparable with the
   /// `expr_type`
   fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option<DataType> {
   ```



##########
datafusion/physical-expr/src/planner.rs:
##########
@@ -339,3 +312,48 @@ pub fn create_physical_expr(
         ))),
     }
 }
+
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
+pub(crate) fn in_list_cast(
+    expr: Arc<dyn PhysicalExpr>,
+    list: Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<InListCastResult> {
+    let expr_type = &expr.data_type(input_schema)?;
+    let list_types: Vec<DataType> = list
+        .iter()
+        .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+        .collect();
+    let result_type = get_coerce_type(expr_type, &list_types);
+    match result_type {
+        None => Err(DataFusionError::Internal(format!(

Review Comment:
   ```suggestion
           None => Err(DataFusionError::Plan(format!(
   ```
   
   It seems like this could be a normal error (due to the query, for example) rather than an internal error



##########
datafusion/core/src/physical_plan/planner.rs:
##########
@@ -1887,7 +1884,7 @@ mod tests {
             .project(vec![col("c1").in_list(list, false)])?
             .build()?;
         let execution_plan = plan(&logical_plan).await?;
-        let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr
  { expr: Literal { value: Int64(9) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal 
 { value: Int64(18) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(2
 7) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false, set: Some(InSet { set:";
+        let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cas
 t_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { e
 xpr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: None }";

Review Comment:
   Here is this expression formatted for anyone else who is interested:
   
   
   ```
   (InListExpr {
     expr: Column { name: \"c1\", index: 0 },
     list: [Literal { value: Utf8(\"a\") },
                    TryCastExpr { expr: Literal { value: Int64(1) },cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 },
                    TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, 
                    TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }],
     negated: false,
     set: None
     }
   ```



##########
datafusion/core/src/physical_plan/planner.rs:
##########
@@ -1739,8 +1739,6 @@ mod tests {
             col("c1").and(col("c1")),
             // u8 AND u8
             col("c3").and(col("c3")),
-            // utf8 = u32

Review Comment:
   Why remove this case?



##########
datafusion/physical-expr/src/planner.rs:
##########
@@ -294,43 +292,18 @@ pub fn create_physical_expr(
                             input_schema,
                             execution_props,
                         ),
-                        // TODO refactor the logic of coercion the data type
-                        // data type in the `list expr` may be conflict with `value expr`,
-                        // we should not just compare data type between `value expr` with each `list expr`.
-                        _ => {
-                            let list_expr = create_physical_expr(
-                                expr,
-                                input_dfschema,
-                                input_schema,
-                                execution_props,
-                            )?;
-                            let list_expr_data_type =
-                                list_expr.data_type(input_schema)?;
-
-                            if list_expr_data_type == value_expr_data_type {
-                                Ok(list_expr)
-                            } else if can_cast_types(
-                                &list_expr_data_type,
-                                &value_expr_data_type,
-                            ) {
-                                // TODO: Can't cast from list type to value type directly
-                                // We should use the coercion rule to get the common data type
-                                expressions::cast(
-                                    list_expr,
-                                    input_schema,
-                                    value_expr.data_type(input_schema)?,
-                                )
-                            } else {
-                                Err(DataFusionError::Plan(format!(
-                                    "Unsupported CAST from {:?} to {:?}",
-                                    list_expr_data_type, value_expr_data_type
-                                )))
-                            }
-                        }
+                        _ => create_physical_expr(
+                            expr,
+                            input_dfschema,
+                            input_schema,
+                            execution_props,
+                        ),
                     })
                     .collect::<Result<Vec<_>>>()?;
 
-                expressions::in_list(value_expr, list_exprs, negated, input_schema)
+                let (cast_expr, cast_list_exprs) =
+                    in_list_cast(value_expr, list_exprs, input_schema)?;

Review Comment:
   This is much better in my opinion. Thank you @viirya 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org