You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2023/06/01 07:10:31 UTC
[spark] branch branch-3.4 updated: [SPARK-43760][SQL][3.4] Nullability of scalar subquery results
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new a2c915db50c [SPARK-43760][SQL][3.4] Nullability of scalar subquery results
a2c915db50c is described below
commit a2c915db50c76fee1290d8a6a6aab9f41100a60b
Author: Andrey Gubichev <an...@databricks.com>
AuthorDate: Thu Jun 1 15:10:11 2023 +0800
[SPARK-43760][SQL][3.4] Nullability of scalar subquery results
### What changes were proposed in this pull request?
Backport of https://github.com/apache/spark/pull/41287.
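Ensures that the output attribute substituted for a correlated scalar subquery during rewriting is declared as nullable, because such a subquery can return NULL (for example, when no rows satisfy the correlation predicate).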
### Why are the changes needed?
This is an existing correctness bug; see https://issues.apache.org/jira/browse/SPARK-43760
### Does this PR introduce _any_ user-facing change?
Yes. It fixes a correctness issue, so it is user-facing.
### How was this patch tested?
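Query tests: new cases were added to scalar-subquery-predicate.sql and scalar-subquery-select.sql, along with their expected results in the corresponding .sql.out files.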
Closes #41408 from agubichev/spark-43760-nullability-branch-3.4.
Authored-by: Andrey Gubichev <an...@databricks.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../spark/sql/catalyst/optimizer/subquery.scala | 2 +-
.../scalar-subquery/scalar-subquery-predicate.sql | 10 ++++++++++
.../scalar-subquery/scalar-subquery-select.sql | 18 +++++++++++++++++-
.../scalar-subquery-predicate.sql.out | 15 +++++++++++++++
.../scalar-subquery/scalar-subquery-select.sql.out | 21 +++++++++++++++++++++
5 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
index 52164512028..1d2f5602630 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
@@ -393,7 +393,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe
val newExpression = expression.transformWithPruning(_.containsPattern(SCALAR_SUBQUERY)) {
case s: ScalarSubquery if s.children.nonEmpty =>
subqueries += s
- s.plan.output.head
+ s.plan.output.head.withNullability(true)
}
newExpression.asInstanceOf[E]
}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
index c8fe4bea642..e5551250dfe 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql
@@ -398,3 +398,13 @@ HAVING t0a <
FROM t2
WHERE t2b <= t0b)
);
+
+-- SPARK-43760: the result of the subquery can be NULL.
+select *
+from range(1, 3) t1
+where (select sum(c) from (
+ select t2.id * t2.id c
+ from range (1, 2) t2 where t1.id = t2.id
+ group by t2.id
+ )
+) is not null;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
index b62cd4b68a1..48d1594fa51 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-select.sql
@@ -344,4 +344,20 @@ SELECT t0a, (SELECT sum(d) FROM
SELECT sum(t2a) + t0a as d
FROM t2)
)
-FROM t0;
\ No newline at end of file
+FROM t0;
+
+-- SPARK-43760: the result of the subquery can be NULL.
+select *
+from
+(
+ select t1.id c1, (
+ select sum(c)
+ from (
+ select t2.id * t2.id c
+ from range (1, 2) t2 where t1.id = t2.id
+ group by t2.id
+ )
+ ) c2
+ from range (1, 3) t1
+) t
+where t.c2 is not null;
diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
index d5dc0f83ef4..46c430d5ba7 100644
--- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out
@@ -648,3 +648,18 @@ HAVING t0a <
struct<t0a:int,t0b:int>
-- !query output
1 1
+
+
+-- !query
+select *
+from range(1, 3) t1
+where (select sum(c) from (
+ select t2.id * t2.id c
+ from range (1, 2) t2 where t1.id = t2.id
+ group by t2.id
+ )
+) is not null
+-- !query schema
+struct<id:bigint>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
index 7e81df1e371..d92a32d2463 100644
--- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out
@@ -754,3 +754,24 @@ org.apache.spark.sql.AnalysisException
"fragment" : "SELECT sum(t0a) as d\n FROM t1"
} ]
}
+
+
+-- !query
+select *
+from
+(
+ select t1.id c1, (
+ select sum(c)
+ from (
+ select t2.id * t2.id c
+ from range (1, 2) t2 where t1.id = t2.id
+ group by t2.id
+ )
+ ) c2
+ from range (1, 3) t1
+) t
+where t.c2 is not null
+-- !query schema
+struct<c1:bigint,c2:bigint>
+-- !query output
+1 1
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org