You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/07/04 14:34:41 UTC
[spark] branch master updated: [SPARK-19799][SQL] Support WITH
clause in subqueries
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new cad440d [SPARK-19799][SQL] Support WITH clause in subqueries
cad440d is described below
commit cad440d1f59d9bb674e06b0f656d6bb9d7872bf9
Author: Peter Toth <pe...@gmail.com>
AuthorDate: Thu Jul 4 07:34:02 2019 -0700
[SPARK-19799][SQL] Support WITH clause in subqueries
## What changes were proposed in this pull request?
This PR adds support of `WITH` clause within a subquery so this query becomes valid:
```
SELECT max(c) FROM (
WITH t AS (SELECT 1 AS c)
SELECT * FROM t
)
```
## How was this patch tested?
Added new UTs.
Closes #24831 from peter-toth/SPARK-19799-2.
Authored-by: Peter Toth <pe...@gmail.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../apache/spark/sql/catalyst/parser/SqlBase.g4 | 10 +-
.../spark/sql/catalyst/analysis/Analyzer.scala | 7 +-
.../spark/sql/catalyst/parser/AstBuilder.scala | 10 +-
.../src/test/resources/sql-tests/inputs/cte.sql | 109 +++++++++++
.../test/resources/sql-tests/results/cte.sql.out | 218 +++++++++++++++++++--
5 files changed, 316 insertions(+), 38 deletions(-)
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index e7e6a60..a1c1150 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -278,7 +278,7 @@ locationSpec
;
query
- : ctes? queryNoWith
+ : ctes? queryTerm queryOrganization
;
insertInto
@@ -380,10 +380,6 @@ dmlStatementNoWith
| fromClause multiInsertQueryBody+ #multiInsertQuery
;
-queryNoWith
- : queryTerm queryOrganization
- ;
-
queryOrganization
: (ORDER BY order+=sortItem (',' order+=sortItem)*)?
(CLUSTER BY clusterBy+=expression (',' clusterBy+=expression)*)?
@@ -412,7 +408,7 @@ queryPrimary
| fromStatement #fromStmt
| TABLE multipartIdentifier #table
| inlineTable #inlineTableDefault1
- | '(' queryNoWith ')' #subquery
+ | '(' query ')' #subquery
;
sortItem
@@ -583,7 +579,7 @@ identifierComment
relationPrimary
: multipartIdentifier sample? tableAlias #tableName
- | '(' queryNoWith ')' sample? tableAlias #aliasedQuery
+ | '(' query ')' sample? tableAlias #aliasedQuery
| '(' relation ')' sample? tableAlias #aliasedRelation
| inlineTable #inlineTableDefault2
| functionTable #tableValuedFunction
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 7b920e1..5d37e90 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -224,12 +224,13 @@ class Analyzer(
case other => other
}
- def substituteCTE(plan: LogicalPlan, cteName: String, ctePlan: LogicalPlan): LogicalPlan = {
+ private def substituteCTE(
+ plan: LogicalPlan,
+ cteName: String,
+ ctePlan: LogicalPlan): LogicalPlan = {
plan resolveOperatorsUp {
case UnresolvedRelation(Seq(table)) if resolver(cteName, table) =>
ctePlan
- case u: UnresolvedRelation =>
- u
case other =>
// This cannot be done in ResolveSubquery because ResolveSubquery does not know the CTE.
other transformExpressions {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index f08cb2a..5eef8db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -111,7 +111,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* Create a top-level plan with Common Table Expressions.
*/
override def visitQuery(ctx: QueryContext): LogicalPlan = withOrigin(ctx) {
- val query = plan(ctx.queryNoWith)
+ val query = plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
// Apply CTEs
query.optionalMap(ctx.ctes)(withCTE)
@@ -175,10 +175,6 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
}
}
- override def visitQueryNoWith(ctx: QueryNoWithContext): LogicalPlan = withOrigin(ctx) {
- plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
- }
-
/**
* Create a named logical plan.
*
@@ -890,7 +886,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* Create a logical plan for a sub-query.
*/
override def visitSubquery(ctx: SubqueryContext): LogicalPlan = withOrigin(ctx) {
- plan(ctx.queryNoWith)
+ plan(ctx.query)
}
/**
@@ -978,7 +974,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* }}}
*/
override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) {
- val relation = plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)
+ val relation = plan(ctx.query).optionalMap(ctx.sample)(withSample)
if (ctx.tableAlias.strictIdentifier == null) {
// For un-aliased subqueries, use a default alias name that is not likely to conflict with
// normal subquery names, so that parent operators can only access the columns in subquery by
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
index 822c5c4..ac448eb 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
@@ -4,6 +4,9 @@ create temporary view t2 as select * from values 0, 1 as t(id);
-- WITH clause should not fall into infinite loop by referencing self
WITH s AS (SELECT 1 FROM s) SELECT * FROM s;
+WITH r AS (SELECT (SELECT * FROM r))
+SELECT * FROM r;
+
-- WITH clause should reference the base table
WITH t AS (SELECT 1 FROM t) SELECT * FROM t;
@@ -28,6 +31,112 @@ FROM CTE1 t1
WITH t(x) AS (SELECT 1)
SELECT * FROM t WHERE x = 1;
+-- CTE in CTE definition
+WITH t as (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+)
+SELECT * FROM t;
+
+-- CTE in subquery
+SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 1)
+ SELECT * FROM t
+);
+
+-- CTE in subquery expression
+SELECT (
+ WITH t AS (SELECT 1)
+ SELECT * FROM t
+);
+
+-- CTE in CTE definition shadows outer
+WITH
+ t AS (SELECT 1),
+ t2 AS (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+ )
+SELECT * FROM t2;
+
+-- CTE in CTE definition shadows outer 2
+WITH
+ t(c) AS (SELECT 1),
+ t2 AS (
+ SELECT (
+ SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+ )
+ )
+ )
+SELECT * FROM t2;
+
+-- CTE in CTE definition shadows outer 3
+WITH
+ t AS (SELECT 1),
+ t2 AS (
+ WITH t AS (SELECT 2),
+ t2 AS (
+ WITH t AS (SELECT 3)
+ SELECT * FROM t
+ )
+ SELECT * FROM t2
+ )
+SELECT * FROM t2;
+
+-- CTE in subquery shadows outer
+WITH t(c) AS (SELECT 1)
+SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+);
+
+-- CTE in subquery shadows outer 2
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+ SELECT max(c) AS c FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+ )
+);
+
+-- CTE in subquery shadows outer 3
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT max(c) AS c FROM (
+ WITH t(c) AS (SELECT 3)
+ SELECT * FROM t
+ )
+);
+
+-- CTE in subquery expression shadows outer
+WITH t AS (SELECT 1)
+SELECT (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+);
+
+-- CTE in subquery expression shadows outer 2
+WITH t AS (SELECT 1)
+SELECT (
+ SELECT (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+ )
+);
+
+-- CTE in subquery expression shadows outer 3
+WITH t AS (SELECT 1)
+SELECT (
+ WITH t AS (SELECT 2)
+ SELECT (
+ WITH t AS (SELECT 3)
+ SELECT * FROM t
+ )
+);
+
-- Clean up
DROP VIEW IF EXISTS t;
DROP VIEW IF EXISTS t2;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
index f8ccecb..b89e29f 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 10
+-- Number of queries: 23
-- !query 0
@@ -28,36 +28,46 @@ Table or view not found: s; line 1 pos 25
-- !query 3
-WITH t AS (SELECT 1 FROM t) SELECT * FROM t
+WITH r AS (SELECT (SELECT * FROM r))
+SELECT * FROM r
-- !query 3 schema
-struct<1:int>
+struct<>
-- !query 3 output
+org.apache.spark.sql.AnalysisException
+Table or view not found: r; line 1 pos 33
+
+
+-- !query 4
+WITH t AS (SELECT 1 FROM t) SELECT * FROM t
+-- !query 4 schema
+struct<1:int>
+-- !query 4 output
1
1
1
--- !query 4
+-- !query 5
WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2
--- !query 4 schema
+-- !query 5 schema
struct<>
--- !query 4 output
+-- !query 5 output
org.apache.spark.sql.AnalysisException
Table or view not found: s2; line 1 pos 26
--- !query 5
+-- !query 6
WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2
--- !query 5 schema
+-- !query 6 schema
struct<id:int,2:int>
--- !query 5 output
+-- !query 6 output
0 2
0 2
1 2
1 2
--- !query 6
+-- !query 7
WITH CTE1 AS (
SELECT b.id AS id
FROM T2 a
@@ -67,9 +77,9 @@ SELECT t1.id AS c1,
t2.id AS c2
FROM CTE1 t1
CROSS JOIN CTE1 t2
--- !query 6 schema
+-- !query 7 schema
struct<c1:int,c2:int>
--- !query 6 output
+-- !query 7 output
0 0
0 0
0 0
@@ -88,26 +98,192 @@ struct<c1:int,c2:int>
1 1
--- !query 7
+-- !query 8
WITH t(x) AS (SELECT 1)
SELECT * FROM t WHERE x = 1
--- !query 7 schema
+-- !query 8 schema
struct<x:int>
--- !query 7 output
+-- !query 8 output
1
--- !query 8
+-- !query 9
+WITH t as (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+)
+SELECT * FROM t
+-- !query 9 schema
+struct<1:int>
+-- !query 9 output
+1
+
+
+-- !query 10
+SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 1)
+ SELECT * FROM t
+)
+-- !query 10 schema
+struct<max(c):int>
+-- !query 10 output
+1
+
+
+-- !query 11
+SELECT (
+ WITH t AS (SELECT 1)
+ SELECT * FROM t
+)
+-- !query 11 schema
+struct<scalarsubquery():int>
+-- !query 11 output
+1
+
+
+-- !query 12
+WITH
+ t AS (SELECT 1),
+ t2 AS (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+ )
+SELECT * FROM t2
+-- !query 12 schema
+struct<1:int>
+-- !query 12 output
+1
+
+
+-- !query 13
+WITH
+ t(c) AS (SELECT 1),
+ t2 AS (
+ SELECT (
+ SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+ )
+ )
+ )
+SELECT * FROM t2
+-- !query 13 schema
+struct<scalarsubquery():int>
+-- !query 13 output
+1
+
+
+-- !query 14
+WITH
+ t AS (SELECT 1),
+ t2 AS (
+ WITH t AS (SELECT 2),
+ t2 AS (
+ WITH t AS (SELECT 3)
+ SELECT * FROM t
+ )
+ SELECT * FROM t2
+ )
+SELECT * FROM t2
+-- !query 14 schema
+struct<2:int>
+-- !query 14 output
+2
+
+
+-- !query 15
+WITH t(c) AS (SELECT 1)
+SELECT max(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+)
+-- !query 15 schema
+struct<max(c):int>
+-- !query 15 output
+2
+
+
+-- !query 16
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+ SELECT max(c) AS c FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT * FROM t
+ )
+)
+-- !query 16 schema
+struct<sum(c):bigint>
+-- !query 16 output
+2
+
+
+-- !query 17
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+ WITH t(c) AS (SELECT 2)
+ SELECT max(c) AS c FROM (
+ WITH t(c) AS (SELECT 3)
+ SELECT * FROM t
+ )
+)
+-- !query 17 schema
+struct<sum(c):bigint>
+-- !query 17 output
+3
+
+
+-- !query 18
+WITH t AS (SELECT 1)
+SELECT (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+)
+-- !query 18 schema
+struct<scalarsubquery():int>
+-- !query 18 output
+1
+
+
+-- !query 19
+WITH t AS (SELECT 1)
+SELECT (
+ SELECT (
+ WITH t AS (SELECT 2)
+ SELECT * FROM t
+ )
+)
+-- !query 19 schema
+struct<scalarsubquery():int>
+-- !query 19 output
+1
+
+
+-- !query 20
+WITH t AS (SELECT 1)
+SELECT (
+ WITH t AS (SELECT 2)
+ SELECT (
+ WITH t AS (SELECT 3)
+ SELECT * FROM t
+ )
+)
+-- !query 20 schema
+struct<scalarsubquery():int>
+-- !query 20 output
+1
+
+
+-- !query 21
DROP VIEW IF EXISTS t
--- !query 8 schema
+-- !query 21 schema
struct<>
--- !query 8 output
+-- !query 21 output
--- !query 9
+-- !query 22
DROP VIEW IF EXISTS t2
--- !query 9 schema
+-- !query 22 schema
struct<>
--- !query 9 output
+-- !query 22 output
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org