You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/06/21 18:34:02 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #6662: feat: support `NULL` in array functions

alamb commented on code in PR #6662:
URL: https://github.com/apache/arrow-datafusion/pull/6662#discussion_r1237411053


##########
datafusion/expr/src/built_in_function.rs:
##########
@@ -440,46 +440,44 @@ impl BuiltinScalarFunction {
         // the return type of the built in function.
         // Some built-in functions' return type depends on the incoming type.
         match self {
-            BuiltinScalarFunction::ArrayAppend => match &input_expr_types[0] {
-                List(field) => Ok(List(Arc::new(Field::new(
-                    "item",
-                    field.data_type().clone(),
-                    true,
-                )))),
-                _ => Err(DataFusionError::Internal(format!(
-                    "The {self} function can only accept list as the first argument"
-                ))),
-            },
-            BuiltinScalarFunction::ArrayConcat => match &input_expr_types[0] {
-                List(field) => Ok(List(Arc::new(Field::new(
-                    "item",
-                    field.data_type().clone(),
-                    true,
-                )))),
-                _ => Err(DataFusionError::Internal(format!(
-                    "The {self} function can only accept fixed size list as the args."
-                ))),
-            },
-            BuiltinScalarFunction::ArrayDims => Ok(UInt8),
-            BuiltinScalarFunction::ArrayFill => Ok(List(Arc::new(Field::new(
+            BuiltinScalarFunction::ArrayAppend => Ok(List(Arc::new(Field::new(
                 "item",
-                input_expr_types[0].clone(),
+                input_expr_types[1].clone(),
                 true,
             )))),
+            BuiltinScalarFunction::ArrayConcat => {
+                let mut expr_type = Null;
+                for input_expr_type in input_expr_types {
+                    match input_expr_type {
+                        List(field) => {
+                            if !field.data_type().equals_datatype(&Null) {
+                                expr_type = field.data_type().clone();
+                                break;
+                            }
+                        }
+                        _ => {
+                            return Err(DataFusionError::Internal(format!(
+                                "The {self} function can only accept list as the args."
+                            )))
+                        }
+                    }
+                }
+
+                Ok(List(Arc::new(Field::new("item", expr_type, true))))
+            }
+            BuiltinScalarFunction::ArrayDims => Ok(UInt8),
+            BuiltinScalarFunction::ArrayFill => Ok(Null),

Review Comment:
   I don't understand why ArrayFill always returns `Null` as its data type
   
   My reading of https://www.postgresql.org/docs/9.1/functions-array.html suggests that it should return something like `List(args[0].type)`
   
   



##########
datafusion/optimizer/src/analyzer/type_coercion.rs:
##########
@@ -635,6 +635,20 @@ fn cast_expr(expr: &Expr, to_type: &DataType, schema: &DFSchema) -> Result<Expr>
     expr.clone().cast_to(to_type, schema)
 }
 
+/// Cast array `expr` to the specified type, if possible
+fn cast_array_expr(
+    expr: &Expr,
+    from_type: &DataType,
+    to_type: &DataType,
+    schema: &DFSchema,
+) -> Result<Expr> {
+    if from_type.equals_datatype(&DataType::Null) {

Review Comment:
   I would have thought we should be casting all the arguments that are `null` to the specific type of the rest of the arguments...



##########
datafusion/core/tests/sqllogictests/test_files/array.slt:
##########
@@ -99,9 +116,9 @@ select array_prepend(1, make_array(2, 3, 4)), array_prepend(1.0, make_array(2.0,
 [1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o]
 
 # array_fill scalar function #1
-query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
+query error DataFusion error: SQL error: TokenizerError\("Unterminated string literal at Line: 2, Column 856"\)
 caused by
-Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
+Internal error: Optimizer rule 'simplify_expressions' failed, due to generate a different schema, original schema: DFSchema \{ fields: \[DFField \{ qualifier: None, field: Field \{ name: "array_fill\(Int64\(1\),make_array\(\)\)", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \} \}\], metadata: \{\} \}, new schema: DFSchema \{ fields: \[DFField \{ qualifier: None, field: Field \{ name: "array_fill\(Int64\(1\),make_array\(\)\)", data_type: List\(Field \{ name: "item", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: \{\} \} \}\], metadata: \{\} \}\. This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker

Review Comment:
   something seems wrong with this test



##########
datafusion/physical-expr/src/array_expressions.rs:
##########
@@ -22,12 +22,23 @@ use arrow::buffer::Buffer;
 use arrow::compute;
 use arrow::datatypes::{DataType, Field};
 use core::any::type_name;
-use datafusion_common::cast::as_list_array;
+use datafusion_common::cast::{as_generic_string_array, as_int64_array, as_list_array};
 use datafusion_common::ScalarValue;
 use datafusion_common::{DataFusionError, Result};
 use datafusion_expr::ColumnarValue;
 use std::sync::Arc;
 
+macro_rules! downcast_arg {

Review Comment:
   I think this is the same as `downcast_value`: https://docs.rs/datafusion-common/26.0.0/datafusion_common/macro.downcast_value.html



##########
datafusion/physical-expr/src/array_expressions.rs:
##########
@@ -56,20 +67,29 @@ macro_rules! new_builder {
 
 macro_rules! array {
     ($ARGS:expr, $ARRAY_TYPE:ident, $BUILDER_TYPE:ident) => {{
-        // downcast all arguments to their common format
-        let args =
-            downcast_vec!($ARGS, $ARRAY_TYPE).collect::<Result<Vec<&$ARRAY_TYPE>>>()?;
-
-        let builder = new_builder!($BUILDER_TYPE, args[0].len());
+        let builder = new_builder!($BUILDER_TYPE, $ARGS[0].len());
         let mut builder =
-            ListBuilder::<$BUILDER_TYPE>::with_capacity(builder, args.len());
+            ListBuilder::<$BUILDER_TYPE>::with_capacity(builder, $ARGS.len());
+
         // for each entry in the array
-        for index in 0..args[0].len() {
-            for arg in &args {
-                if arg.is_null(index) {
-                    builder.values().append_null();
-                } else {
-                    builder.values().append_value(arg.value(index));
+        for index in 0..$ARGS[0].len() {

Review Comment:
   I am not sure about this approach of taking either a `ListArray` or a `NullArray`
   
   In the other functions, the way NULL is treated is that the input types are always the same (in this case ListArray) and the values would be `null` (aka `array.is_valid(i)` would return false for rows that are null.
   
   Complicating matters is if you type a literal `null` in sql like:
   
   ```sql
   select array_concat([1,2], null)
   ```
   
   That comes to DataFusion as a `null` literal (with DataType::Null). The coercion / casting logic normally will coerce this to the appropriate type. 
   
   For example, here is how I think arithmetic works with null:
   
   ```sql
   select 1 + NULL
   ```
   
   Arrives like 
   ```sql
   ScalarValue::Int32(Some(1)) + ScalarValue::Null
   ```
   
   And then the coercion logic will add a cast to Int32:
   
   ```sql
   ScalarValue::Int32(Some(1)) + CAST(ScalarValue::Null, DataType::Int32)
   ```
   
   And then the constant folder will collapse this into:
   
   
   ```sql
   ScalarValue::Int32(Some(1)) + ScalarValue::Int32(None)
   ```
   
   So by the time the arithmetic kernel sees it, it only has to deal with arguments of `Int32`
   



##########
datafusion/expr/src/built_in_function.rs:
##########
@@ -440,46 +440,44 @@ impl BuiltinScalarFunction {
         // the return type of the built in function.
         // Some built-in functions' return type depends on the incoming type.
         match self {
-            BuiltinScalarFunction::ArrayAppend => match &input_expr_types[0] {
-                List(field) => Ok(List(Arc::new(Field::new(
-                    "item",
-                    field.data_type().clone(),
-                    true,
-                )))),
-                _ => Err(DataFusionError::Internal(format!(
-                    "The {self} function can only accept list as the first argument"
-                ))),
-            },
-            BuiltinScalarFunction::ArrayConcat => match &input_expr_types[0] {
-                List(field) => Ok(List(Arc::new(Field::new(
-                    "item",
-                    field.data_type().clone(),
-                    true,
-                )))),
-                _ => Err(DataFusionError::Internal(format!(
-                    "The {self} function can only accept fixed size list as the args."
-                ))),
-            },
-            BuiltinScalarFunction::ArrayDims => Ok(UInt8),
-            BuiltinScalarFunction::ArrayFill => Ok(List(Arc::new(Field::new(
+            BuiltinScalarFunction::ArrayAppend => Ok(List(Arc::new(Field::new(
                 "item",
-                input_expr_types[0].clone(),
+                input_expr_types[1].clone(),
                 true,
             )))),
+            BuiltinScalarFunction::ArrayConcat => {
+                let mut expr_type = Null;
+                for input_expr_type in input_expr_types {
+                    match input_expr_type {
+                        List(field) => {
+                            if !field.data_type().equals_datatype(&Null) {
+                                expr_type = field.data_type().clone();
+                                break;
+                            }
+                        }
+                        _ => {
+                            return Err(DataFusionError::Internal(format!(

Review Comment:
   I think Internal errors are only intended for bugs in DataFusion -- this error seems like it could come from bad user input too
   
   ```suggestion
                               return Err(DataFusionError::Plan(format!(
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org