You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "avantgardnerio (via GitHub)" <gi...@apache.org> on 2023/02/24 16:54:55 UTC

[GitHub] [arrow-datafusion] avantgardnerio commented on a diff in pull request #5384: Bug fix: Window frame range value outside the type range

avantgardnerio commented on code in PR #5384:
URL: https://github.com/apache/arrow-datafusion/pull/5384#discussion_r1117290276


##########
datafusion/optimizer/src/type_coercion.rs:
##########
@@ -426,95 +426,128 @@ impl ExprRewriter for TypeCoercionRewriter {
     }
 }
 
-/// Casts the ScalarValue `value` to coerced type.
-// When coerced type is `Interval` we use `parse_interval` since `try_from_string` not
-// supports conversion from string to Interval
-fn convert_to_coerced_type(
-    coerced_type: &DataType,
-    value: &ScalarValue,
-) -> Result<ScalarValue> {
+/// Casts the given `value` to `target_type`. Note that this function
+/// only considers `Null` or `Utf8` values.
+fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result<ScalarValue> {
     match value {
-        // In here we do casting either for NULL types or
-        // ScalarValue::Utf8(Some(val)). The other types are already casted.
-        // The reason is that we convert the sqlparser result
-        // to the Utf8 for all possible cases. Hence the types other than Utf8
-        // are already casted to appropriate type. Therefore they can be returned directly.
+        // Coerce Utf8 values:
         ScalarValue::Utf8(Some(val)) => {
-            // we need special handling for Interval types
-            if let DataType::Interval(..) = coerced_type {
+            // When `target_type` is `Interval`, we use `parse_interval` since
+            // `try_from_string` does not support `String` to `Interval` coercions.
+            if let DataType::Interval(..) = target_type {
                 parse_interval("millisecond", val)
             } else {
-                ScalarValue::try_from_string(val.clone(), coerced_type)
+                ScalarValue::try_from_string(val.clone(), target_type)
             }
         }
         s => {
             if s.is_null() {
-                ScalarValue::try_from(coerced_type)
+                // Coerce `Null` values:
+                ScalarValue::try_from(target_type)
             } else {
+                // Values except `Utf8`/`Null` variants already have the right type
+                // (casted before) since we convert `sqlparser` outputs to `Utf8`
+                // for all possible cases. Therefore, we return a clone here.
                 Ok(s.clone())
             }
         }
     }
 }
 
+/// This function coerces `value` to `target_type` in a range-aware fashion.
+/// If the coercion is successful, we return an `Ok` value with the result.
+/// If the coercion fails because `target_type` is not wide enough (i.e. we
+/// can not coerce to `target_type`, but we can to a wider type in the same
+/// family), we return a `Null` value of this type to signal this situation.
+/// Downstream code uses this signal to treat these values as *unbounded*.
+fn coerce_scalar_range_aware(
+    target_type: &DataType,
+    value: &ScalarValue,
+) -> Result<ScalarValue> {
+    coerce_scalar(target_type, value).or_else(|err| {
+        // If type coercion fails, check if the largest type in family works:
+        if let Some(largest_type) = get_widest_type_in_family(target_type) {
+            coerce_scalar(largest_type, value).map_or_else(
+                |_| {
+                    Err(DataFusionError::Execution(format!(
+                        "Cannot cast {:?} to {:?}",
+                        value, target_type
+                    )))
+                },
+                |_| ScalarValue::try_from(target_type),

Review Comment:
   I don't understand - this is ignoring the result of `coerce_scalar()` and re-parsing it as `ScalarValue::try_from(target_type)`?



##########
datafusion/optimizer/src/type_coercion.rs:
##########
@@ -426,95 +426,128 @@ impl ExprRewriter for TypeCoercionRewriter {
     }
 }
 
-/// Casts the ScalarValue `value` to coerced type.
-// When coerced type is `Interval` we use `parse_interval` since `try_from_string` not
-// supports conversion from string to Interval
-fn convert_to_coerced_type(
-    coerced_type: &DataType,
-    value: &ScalarValue,
-) -> Result<ScalarValue> {
+/// Casts the given `value` to `target_type`. Note that this function
+/// only considers `Null` or `Utf8` values.
+fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result<ScalarValue> {
     match value {
-        // In here we do casting either for NULL types or
-        // ScalarValue::Utf8(Some(val)). The other types are already casted.
-        // The reason is that we convert the sqlparser result
-        // to the Utf8 for all possible cases. Hence the types other than Utf8
-        // are already casted to appropriate type. Therefore they can be returned directly.
+        // Coerce Utf8 values:
         ScalarValue::Utf8(Some(val)) => {
-            // we need special handling for Interval types
-            if let DataType::Interval(..) = coerced_type {
+            // When `target_type` is `Interval`, we use `parse_interval` since
+            // `try_from_string` does not support `String` to `Interval` coercions.
+            if let DataType::Interval(..) = target_type {
                 parse_interval("millisecond", val)
             } else {
-                ScalarValue::try_from_string(val.clone(), coerced_type)
+                ScalarValue::try_from_string(val.clone(), target_type)
             }
         }
         s => {
             if s.is_null() {
-                ScalarValue::try_from(coerced_type)
+                // Coerce `Null` values:
+                ScalarValue::try_from(target_type)
             } else {
+                // Values except `Utf8`/`Null` variants already have the right type
+                // (casted before) since we convert `sqlparser` outputs to `Utf8`
+                // for all possible cases. Therefore, we return a clone here.
                 Ok(s.clone())
             }
         }
     }
 }
 
+/// This function coerces `value` to `target_type` in a range-aware fashion.
+/// If the coercion is successful, we return an `Ok` value with the result.
+/// If the coercion fails because `target_type` is not wide enough (i.e. we
+/// can not coerce to `target_type`, but we can to a wider type in the same
+/// family), we return a `Null` value of this type to signal this situation.

Review Comment:
   Where is the `Null` introduced? `ScalarValue::try_from(target_type)`?
   
   Rust has `Option` and `Result` to indicate an operation was unsuccessful, is there a reason to use a valid value (Null) which callers must be aware of, vs returning one of the above to indicate an error? I could see how callers that did not read this documentation might continue on with the `Null` introducing subtle errors later in the call stack.



##########
datafusion/optimizer/src/type_coercion.rs:
##########
@@ -426,95 +426,128 @@ impl ExprRewriter for TypeCoercionRewriter {
     }
 }
 
-/// Casts the ScalarValue `value` to coerced type.
-// When coerced type is `Interval` we use `parse_interval` since `try_from_string` not
-// supports conversion from string to Interval
-fn convert_to_coerced_type(
-    coerced_type: &DataType,
-    value: &ScalarValue,
-) -> Result<ScalarValue> {
+/// Casts the given `value` to `target_type`. Note that this function
+/// only considers `Null` or `Utf8` values.
+fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result<ScalarValue> {
     match value {
-        // In here we do casting either for NULL types or
-        // ScalarValue::Utf8(Some(val)). The other types are already casted.
-        // The reason is that we convert the sqlparser result
-        // to the Utf8 for all possible cases. Hence the types other than Utf8
-        // are already casted to appropriate type. Therefore they can be returned directly.
+        // Coerce Utf8 values:
         ScalarValue::Utf8(Some(val)) => {
-            // we need special handling for Interval types
-            if let DataType::Interval(..) = coerced_type {
+            // When `target_type` is `Interval`, we use `parse_interval` since
+            // `try_from_string` does not support `String` to `Interval` coercions.
+            if let DataType::Interval(..) = target_type {
                 parse_interval("millisecond", val)
             } else {
-                ScalarValue::try_from_string(val.clone(), coerced_type)
+                ScalarValue::try_from_string(val.clone(), target_type)
             }
         }
         s => {
             if s.is_null() {
-                ScalarValue::try_from(coerced_type)
+                // Coerce `Null` values:
+                ScalarValue::try_from(target_type)
             } else {
+                // Values except `Utf8`/`Null` variants already have the right type
+                // (casted before) since we convert `sqlparser` outputs to `Utf8`
+                // for all possible cases. Therefore, we return a clone here.
                 Ok(s.clone())
             }
         }
     }
 }
 
+/// This function coerces `value` to `target_type` in a range-aware fashion.
+/// If the coercion is successful, we return an `Ok` value with the result.
+/// If the coercion fails because `target_type` is not wide enough (i.e. we
+/// can not coerce to `target_type`, but we can to a wider type in the same
+/// family), we return a `Null` value of this type to signal this situation.

Review Comment:
   Oh, it looks like this is because that's the existing behavior of `WindowFrameBound`?:
   
   ```
   pub enum WindowFrameBound {
       Preceding(ScalarValue),
       CurrentRow,
       Following(ScalarValue),
   }
   ```



##########
datafusion/expr/src/type_coercion.rs:
##########
@@ -62,17 +62,18 @@ pub fn is_numeric(dt: &DataType) -> bool {
         )
 }
 
-/// Determine if a DataType is Timestamp or not
+/// Determine whether the given data type `dt` is a `Timestamp`.
 pub fn is_timestamp(dt: &DataType) -> bool {
     matches!(dt, DataType::Timestamp(_, _))
 }
 
-/// Determine if a DataType is Date or not
+/// Determine whether the given data type `dt` is a `Date`.
 pub fn is_date(dt: &DataType) -> bool {
     matches!(dt, DataType::Date32 | DataType::Date64)
 }
 
-pub fn is_uft8(dt: &DataType) -> bool {
+/// Determine whether the given data type `dt` is a `Utf8`.
+pub fn is_utf8(dt: &DataType) -> bool {

Review Comment:
   Thanks for fixing this.



##########
datafusion/optimizer/src/type_coercion.rs:
##########
@@ -426,95 +426,128 @@ impl ExprRewriter for TypeCoercionRewriter {
     }
 }
 
-/// Casts the ScalarValue `value` to coerced type.
-// When coerced type is `Interval` we use `parse_interval` since `try_from_string` not

Review Comment:
   The issue was not introduced by this PR, but it seems like it would be a good idea to update `try_from_string()`...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org