You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/06/27 18:56:42 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6778: Cleanup type coercion (#3419)

alamb commented on code in PR #6778:
URL: https://github.com/apache/arrow-datafusion/pull/6778#discussion_r1244197674


##########
datafusion/core/tests/sqllogictests/test_files/interval.slt:
##########
@@ -432,11 +432,15 @@ select '1 month'::interval + '1980-01-01T12:00:00'::timestamp;
 
 # Exected error: interval (scalar) - date / timestamp (scalar)

Review Comment:
   ```suggestion
   ```



##########
datafusion/optimizer/src/analyzer/type_coercion.rs:
##########
@@ -230,72 +230,15 @@ impl TreeNodeRewriter for TypeCoercionRewriter {
                 let expr = Expr::ILike(Like::new(negated, expr, pattern, escape_char));
                 Ok(expr)
             }
-            Expr::BinaryExpr(BinaryExpr {
-                ref left,
-                op,
-                ref right,
-            }) => {
-                // this is a workaround for https://github.com/apache/arrow-datafusion/issues/3419
-                let left_type = left.get_type(&self.schema)?;
-                let right_type = right.get_type(&self.schema)?;
-                match (&left_type, &right_type) {
-                    // Handle some case about Interval.
-                    (
-                        DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _),
-                        &DataType::Interval(_),
-                    ) if matches!(op, Operator::Plus | Operator::Minus) => Ok(expr),
-                    (
-                        &DataType::Interval(_),
-                        DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _),
-                    ) if matches!(op, Operator::Plus) => Ok(expr),
-                    (DataType::Timestamp(_, _), DataType::Timestamp(_, _))
-                        if op.is_numerical_operators() =>
-                    {
-                        if matches!(op, Operator::Minus) {
-                            Ok(expr)
-                        } else {
-                            Err(DataFusionError::Internal(format!(
-                                "Unsupported operation {op:?} between {left_type:?} and {right_type:?}"
-                            )))
-                        }
-                    }
-                    // For numerical operations between decimals, we don't coerce the types.
-                    // But if only one of the operands is decimal, we cast the other operand to decimal
-                    // if the other operand is integer. If the other operand is float, we cast the
-                    // decimal operand to float.
-                    (lhs_type, rhs_type)
-                        if op.is_numerical_operators()
-                            && any_decimal(lhs_type, rhs_type) =>
-                    {
-                        let (coerced_lhs_type, coerced_rhs_type) =
-                            math_decimal_coercion(lhs_type, rhs_type);
-                        let new_left = if let Some(lhs_type) = coerced_lhs_type {
-                            left.clone().cast_to(&lhs_type, &self.schema)?
-                        } else {
-                            left.as_ref().clone()
-                        };
-                        let new_right = if let Some(rhs_type) = coerced_rhs_type {
-                            right.clone().cast_to(&rhs_type, &self.schema)?
-                        } else {
-                            right.as_ref().clone()
-                        };
-                        let expr = Expr::BinaryExpr(BinaryExpr::new(
-                            Box::new(new_left),
-                            op,
-                            Box::new(new_right),
-                        ));
-                        Ok(expr)
-                    }
-                    _ => {
-                        let common_type = coerce_types(&left_type, &op, &right_type)?;
-                        let expr = Expr::BinaryExpr(BinaryExpr::new(
-                            Box::new(left.clone().cast_to(&common_type, &self.schema)?),
-                            op,
-                            Box::new(right.clone().cast_to(&common_type, &self.schema)?),
-                        ));
-                        Ok(expr)
-                    }
-                }
+            Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
+                let lhs = left.get_type(&self.schema)?;

Review Comment:
   nit: the names `left_type` and `right_type` I think would be more consistent with the rest of the code in this file and more readable. The convention in the rest of the code seems to be `lhs` and `rhs` refer to the left and right argument `Expr` themselves
   
   The same comment applies to the other changes in this file too



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -499,6 +492,7 @@ pub fn coercion_decimal_mathematics_type(
     left_decimal_type: &DataType,
     right_decimal_type: &DataType,
 ) -> Option<DataType> {
+    // TODO: Move this logic into kernel implementations

Review Comment:
   FYI @viirya 



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -25,9 +25,144 @@ use arrow::datatypes::{
 use datafusion_common::DataFusionError;
 use datafusion_common::Result;
 
-use crate::type_coercion::{is_datetime, is_decimal, is_interval, is_numeric};
+use crate::type_coercion::is_numeric;
 use crate::Operator;
 
+/// The type signature of an instantiation of binary expression
+struct Signature {

Review Comment:
   I wonder if we can use a different name for this struture as there is already a `Signature` in https://docs.rs/datafusion-expr/26.0.0/datafusion_expr/struct.Signature.html
   
   Perhaps `OpSignature` or `BinarySignature` ?



##########
datafusion/optimizer/src/analyzer/type_coercion.rs:
##########
@@ -230,72 +230,15 @@ impl TreeNodeRewriter for TypeCoercionRewriter {
                 let expr = Expr::ILike(Like::new(negated, expr, pattern, escape_char));
                 Ok(expr)
             }
-            Expr::BinaryExpr(BinaryExpr {
-                ref left,
-                op,
-                ref right,
-            }) => {
-                // this is a workaround for https://github.com/apache/arrow-datafusion/issues/3419

Review Comment:
   cc @berkaysynnada 



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -319,8 +318,48 @@ fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D
     }
 }
 
-/// Returns the output type of applying numeric operations such as `=`
-/// to arguments `lhs_type` and `rhs_type` if both are numeric
+/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation
+/// where one is numeric and one is `Utf8`/`LargeUtf8`.

Review Comment:
   ```suggestion
   /// where one is temporal and one is `Utf8`/`LargeUtf8`.
   ```



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -843,10 +829,13 @@ mod tests {
     #[test]
     fn test_coercion_error() -> Result<()> {
         let result_type =
-            coerce_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8);
+            get_input_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8);
 
         if let Err(DataFusionError::Plan(e)) = result_type {
-            assert_eq!(e, "Float32 + Utf8 can't be evaluated because there isn't a common type to coerce the types to");
+            assert_eq!(
+                e,
+                "Cannot coerce arithmetic expression Float32 + Utf8 to valid types"

Review Comment:
   I think the new messages are easier to understand 👍 



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -25,9 +25,144 @@ use arrow::datatypes::{
 use datafusion_common::DataFusionError;
 use datafusion_common::Result;
 
-use crate::type_coercion::{is_datetime, is_decimal, is_interval, is_numeric};
+use crate::type_coercion::is_numeric;
 use crate::Operator;
 
+/// The type signature of an instantiation of binary expression
+struct Signature {
+    /// The type to coerce the left argument to
+    lhs: DataType,
+    /// The type to coerce the right argument to
+    rhs: DataType,
+    /// The return type of the expression
+    ret: DataType,
+}
+
+impl Signature {
+    /// A signature where the inputs are coerced to the same type as the output
+    fn coerced(t: DataType) -> Self {

Review Comment:
   maybe this could be called "uniform" to better describe it -- coerce I think is the general term for automatic casting.



##########
datafusion/core/src/physical_planner.rs:
##########
@@ -2140,18 +2140,17 @@ mod tests {
     async fn errors() -> Result<()> {
         let bool_expr = col("c1").eq(col("c1"));
         let cases = vec![
-            // utf8 AND utf8
-            col("c1").and(col("c1")),
+            // utf8 = utf8
+            col("c1").eq(col("c1")),
             // u8 AND u8
-            col("c3").and(col("c3")),
-            // utf8 = bool
-            col("c1").eq(bool_expr.clone()),
-            // u32 AND bool
-            col("c2").and(bool_expr),
+            col("c3").bitand(col("c3")),
+            // utf8 = u8
+            col("c1").eq(col("c3")),
+            // bool AND bool
+            bool_expr.clone().and(bool_expr),
         ];

Review Comment:
   perhaps @izveigor  remembers 🤔 



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1017,24 +989,27 @@ mod tests {
 
     macro_rules! test_coercion_binary_rule {
         ($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{
-            let result = coerce_types(&$A_TYPE, &$OP, &$B_TYPE)?;
-            assert_eq!(result, $C_TYPE);
+            let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?;
+            assert_eq!(lhs, $C_TYPE);
+            assert_eq!(rhs, $C_TYPE);
         }};
     }
 
     #[test]
     fn test_date_timestamp_arithmetic_error() -> Result<()> {
-        let common_type = coerce_types(
+        let (lhs, rhs) = get_input_types(
             &DataType::Timestamp(TimeUnit::Nanosecond, None),
             &Operator::Minus,
             &DataType::Timestamp(TimeUnit::Millisecond, None),
         )?;
-        assert_eq!(common_type.to_string(), "Timestamp(Millisecond, None)");
+        assert_eq!(lhs.to_string(), "Timestamp(Millisecond, None)");
+        assert_eq!(rhs.to_string(), "Timestamp(Millisecond, None)");
 
-        let err = coerce_types(&DataType::Date32, &Operator::Plus, &DataType::Date64)
-            .unwrap_err()
-            .to_string();
-        assert_contains!(&err, "Date32 + Date64 can't be evaluated because there isn't a common type to coerce the types to");
+        let (lhs, rhs) =
+            get_input_types(&DataType::Date32, &Operator::Plus, &DataType::Date64)

Review Comment:
   At one point I think the arrow cast kernels didn't support the coercion so perhaps that is why it was not done automatically



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -25,9 +25,144 @@ use arrow::datatypes::{
 use datafusion_common::DataFusionError;
 use datafusion_common::Result;
 
-use crate::type_coercion::{is_datetime, is_decimal, is_interval, is_numeric};
+use crate::type_coercion::is_numeric;
 use crate::Operator;
 
+/// The type signature of an instantiation of binary expression
+struct Signature {
+    /// The type to coerce the left argument to
+    lhs: DataType,
+    /// The type to coerce the right argument to
+    rhs: DataType,
+    /// The return type of the expression
+    ret: DataType,
+}
+
+impl Signature {
+    /// A signature where the inputs are coerced to the same type as the output
+    fn coerced(t: DataType) -> Self {
+        Self {
+            lhs: t.clone(),
+            rhs: t.clone(),
+            ret: t,
+        }
+    }
+
+    /// A signature where the inputs are coerced to the same type with a boolean output
+    fn comparison(t: DataType) -> Self {
+        Self {
+            lhs: t.clone(),
+            rhs: t,
+            ret: DataType::Boolean,
+        }
+    }
+}
+
+/// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs`

Review Comment:
   ❤️  This is perfect. Thank you



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org