You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2023/06/01 07:10:31 UTC

[spark] branch branch-3.4 updated: [SPARK-43760][SQL][3.4] Nullability of scalar subquery results

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new a2c915db50c [SPARK-43760][SQL][3.4] Nullability of scalar subquery results
a2c915db50c is described below

commit a2c915db50c76fee1290d8a6a6aab9f41100a60b
Author: Andrey Gubichev <an...@databricks.com>
AuthorDate: Thu Jun 1 15:10:11 2023 +0800

    [SPARK-43760][SQL][3.4] Nullability of scalar subquery results
    
    ### What changes were proposed in this pull request?
    
    Backport of https://github.com/apache/spark/pull/41287.
    
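    Makes sure that the result of a correlated scalar subquery is declared as nullable: `RewriteCorrelatedScalarSubquery` now substitutes `s.plan.output.head.withNullability(true)` for the subquery instead of the plain `s.plan.output.head`.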
    
    ### Why are the changes needed?
    
    This is an existing correctness bug; see https://issues.apache.org/jira/browse/SPARK-43760
    
    ### Does this PR introduce _any_ user-facing change?
    
    Fixes a correctness issue, so it is user-facing.
    
    ### How was this patch tested?
    
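    Added query tests to `scalar-subquery-predicate.sql` and `scalar-subquery-select.sql`, together with their expected results in the corresponding `.sql.out` files.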
    
    Closes #41408 from agubichev/spark-43760-nullability-branch-3.4.
    
    Authored-by: Andrey Gubichev <an...@databricks.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/optimizer/subquery.scala     |  2 +-
 .../scalar-subquery/scalar-subquery-predicate.sql   | 10 ++++++++++
 .../scalar-subquery/scalar-subquery-select.sql      | 18 +++++++++++++++++-
 .../scalar-subquery-predicate.sql.out               | 15 +++++++++++++++
 .../scalar-subquery/scalar-subquery-select.sql.out  | 21 +++++++++++++++++++++
 5 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
index 52164512028..1d2f5602630 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
@@ -393,7 +393,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe
     val newExpression = expression.transformWithPruning(_.containsPattern(SCALAR_SUBQUERY)) {
       case s: ScalarSubquery if s.children.nonEmpty =>
         subqueries += s
-        s.plan.output.head
+        s.plan.output.head.withNullability(true)
     }
     newExpression.asInstanceOf[E]
   }
diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
index c8fe4bea642..e5551250dfe 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
@@ -398,3 +398,13 @@ HAVING t0a <
   FROM   t2
   WHERE  t2b <= t0b)
 );
+
+-- SPARK-43760: the result of the subquery can be NULL.
+select *
+from range(1, 3) t1
+where (select sum(c) from (
+        select t2.id * t2.id c
+        from range (1, 2) t2 where t1.id = t2.id
+        group by t2.id
+       )
+) is not null;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
index b62cd4b68a1..48d1594fa51 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
@@ -344,4 +344,20 @@ SELECT t0a, (SELECT sum(d) FROM
   SELECT sum(t2a) + t0a as d
   FROM   t2)
 )
-FROM t0;
\ No newline at end of file
+FROM t0;
+
+-- SPARK-43760: the result of the subquery can be NULL.
+select *
+from
+(
+ select t1.id c1, (
+                    select sum(c)
+                    from (
+                      select t2.id * t2.id c
+                      from range (1, 2) t2 where t1.id = t2.id
+                      group by t2.id
+                    )
+                   ) c2
+ from range (1, 3) t1
+) t
+where t.c2 is not null;
diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
index d5dc0f83ef4..46c430d5ba7 100644
--- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
@@ -648,3 +648,18 @@ HAVING t0a <
 struct<t0a:int,t0b:int>
 -- !query output
 1	1
+
+
+-- !query
+select *
+from range(1, 3) t1
+where (select sum(c) from (
+        select t2.id * t2.id c
+        from range (1, 2) t2 where t1.id = t2.id
+        group by t2.id
+       )
+) is not null
+-- !query schema
+struct<id:bigint>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
index 7e81df1e371..d92a32d2463 100644
--- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
@@ -754,3 +754,24 @@ org.apache.spark.sql.AnalysisException
     "fragment" : "SELECT sum(t0a) as d\n  FROM   t1"
   } ]
 }
+
+
+-- !query
+select *
+from
+(
+ select t1.id c1, (
+                    select sum(c)
+                    from (
+                      select t2.id * t2.id c
+                      from range (1, 2) t2 where t1.id = t2.id
+                      group by t2.id
+                    )
+                   ) c2
+ from range (1, 3) t1
+) t
+where t.c2 is not null
+-- !query schema
+struct<c1:bigint,c2:bigint>
+-- !query output
+1	1


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org