You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/11 11:23:31 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #2510: fix `NULL column` evaluation, tests for same

alamb commented on code in PR #2510:
URL: https://github.com/apache/arrow-datafusion/pull/2510#discussion_r870180841


##########
datafusion/core/tests/sql/expr.rs:
##########
@@ -1203,121 +1203,54 @@ async fn nested_subquery() -> Result<()> {
 }
 
 #[tokio::test]
-async fn comparisons_with_null() -> Result<()> {
+async fn comparisons_with_null_lt() {
     let ctx = SessionContext::new();
-    // 1. Numeric comparison with NULL
-    let sql = "select column1 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+-------------------+",
-        "| t.column1 Lt NULL |",
-        "+-------------------+",
-        "|                   |",
-        "|                   |",
-        "+-------------------+",
-    ];
-    assert_batches_eq!(expected, &actual);
 
-    let sql =
-        "select column1 <= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+---------------------+",
-        "| t.column1 LtEq NULL |",
-        "+---------------------+",
-        "|                     |",
-        "|                     |",
-        "+---------------------+",
+    // we expect all the following queries to yield a two null values
+    let cases = vec![
+        // 1. Numeric comparison with NULL

Review Comment:
   I wanted to test a bunch of different combinations, so I rewrote the test so I could check them all



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -1156,39 +1150,25 @@ impl BinaryExpr {
         array: &dyn Array,
         scalar: &ScalarValue,
     ) -> Result<Option<Result<ArrayRef>>> {
+        let bool_type = &DataType::Boolean;

Review Comment:
   changes here were to make the code more readable (so `cargo fmt` put it on the same line)



##########
datafusion/core/src/physical_plan/file_format/parquet.rs:
##########
@@ -1452,9 +1452,10 @@ mod tests {
             .enumerate()
             .map(|(i, g)| row_group_predicate(g, i))
             .collect::<Vec<_>>();
-        // no row group is filtered out because the predicate expression can't be evaluated
-        // when a null array is generated for a statistics column,
-        assert_eq!(row_group_filter, vec![true, true]);
+
+        // bool = NULL always evaluates to NULL (and thus will not

Review Comment:
   this is the only surprising change I thought



##########
datafusion/core/tests/sql/expr.rs:
##########
@@ -1203,121 +1203,54 @@ async fn nested_subquery() -> Result<()> {
 }
 
 #[tokio::test]
-async fn comparisons_with_null() -> Result<()> {
+async fn comparisons_with_null_lt() {
     let ctx = SessionContext::new();
-    // 1. Numeric comparison with NULL
-    let sql = "select column1 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+-------------------+",
-        "| t.column1 Lt NULL |",
-        "+-------------------+",
-        "|                   |",
-        "|                   |",
-        "+-------------------+",
-    ];
-    assert_batches_eq!(expected, &actual);
 
-    let sql =
-        "select column1 <= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t";
-    let actual = execute_to_batches(&ctx, sql).await;
-    let expected = vec![
-        "+---------------------+",
-        "| t.column1 LtEq NULL |",
-        "+---------------------+",
-        "|                     |",
-        "|                     |",
-        "+---------------------+",
+    // we expect all the following queries to yield a two null values
+    let cases = vec![
+        // 1. Numeric comparison with NULL
+        "select column1 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        "select column1 <= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        "select column1 > NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        "select column1 >= NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        "select column1 = NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        "select column1 != NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        // 1.1 Float value comparison with NULL
+        "select column3 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        // String comparison with NULL
+        "select column2 < NULL from (VALUES (1, 'foo' ,2.3), (2, 'bar', 5.4)) as t",
+        // Boolean comparison with NULL
+        "select column1 < NULL from (VALUES (true), (false)) as t",
+        // ----
+        // ---- same queries, reversed argument order (as they go through
+        // ---- a different evaluation path)
+        // ----
+
+        // 1. Numeric comparison with NULL

Review Comment:
   Tests below here are new. The tests above were pre-existing



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -1255,14 +1235,25 @@ impl BinaryExpr {
         scalar: &ScalarValue,
         array: &ArrayRef,
     ) -> Result<Option<Result<ArrayRef>>> {
+        let bool_type = &DataType::Boolean;
         let scalar_result = match &self.op {
-            Operator::Lt => binary_array_op_scalar!(array, scalar.clone(), gt),
-            Operator::LtEq => binary_array_op_scalar!(array, scalar.clone(), gt_eq),
-            Operator::Gt => binary_array_op_scalar!(array, scalar.clone(), lt),
-            Operator::GtEq => binary_array_op_scalar!(array, scalar.clone(), lt_eq),
-            Operator::Eq => binary_array_op_scalar!(array, scalar.clone(), eq),
+            Operator::Lt => {
+                binary_array_op_dyn_scalar!(array, scalar.clone(), gt, bool_type)

Review Comment:
   This is the actual fix -- to use `binary_array_op_dyn_scalar` which @WinkerDu  updated to handle null constants rather than `binary_array_op_scalar` which does not.
   
   I plan to look into removing `binary_array_op_scalar` completely as a follow on PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org