You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/07/04 14:34:41 UTC

[spark] branch master updated: [SPARK-19799][SQL] Support WITH clause in subqueries

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new cad440d  [SPARK-19799][SQL] Support WITH clause in subqueries
cad440d is described below

commit cad440d1f59d9bb674e06b0f656d6bb9d7872bf9
Author: Peter Toth <pe...@gmail.com>
AuthorDate: Thu Jul 4 07:34:02 2019 -0700

    [SPARK-19799][SQL] Support WITH clause in subqueries
    
    ## What changes were proposed in this pull request?
    
    This PR  adds support of `WITH` clause within a subquery so this query becomes valid:
      ```
      SELECT max(c) FROM (
        WITH t AS (SELECT 1 AS c)
        SELECT * FROM t
      )
     ```
    
    ## How was this patch tested?
    
    Added new UTs.
    
    Closes #24831 from peter-toth/SPARK-19799-2.
    
    Authored-by: Peter Toth <pe...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../apache/spark/sql/catalyst/parser/SqlBase.g4    |  10 +-
 .../spark/sql/catalyst/analysis/Analyzer.scala     |   7 +-
 .../spark/sql/catalyst/parser/AstBuilder.scala     |  10 +-
 .../src/test/resources/sql-tests/inputs/cte.sql    | 109 +++++++++++
 .../test/resources/sql-tests/results/cte.sql.out   | 218 +++++++++++++++++++--
 5 files changed, 316 insertions(+), 38 deletions(-)

diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index e7e6a60..a1c1150 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -278,7 +278,7 @@ locationSpec
     ;
 
 query
-    : ctes? queryNoWith
+    : ctes? queryTerm queryOrganization
     ;
 
 insertInto
@@ -380,10 +380,6 @@ dmlStatementNoWith
     | fromClause multiInsertQueryBody+                                             #multiInsertQuery
     ;
 
-queryNoWith
-    : queryTerm queryOrganization
-    ;
-
 queryOrganization
     : (ORDER BY order+=sortItem (',' order+=sortItem)*)?
       (CLUSTER BY clusterBy+=expression (',' clusterBy+=expression)*)?
@@ -412,7 +408,7 @@ queryPrimary
     | fromStatement                                                         #fromStmt
     | TABLE multipartIdentifier                                             #table
     | inlineTable                                                           #inlineTableDefault1
-    | '(' queryNoWith  ')'                                                  #subquery
+    | '(' query ')'                                                         #subquery
     ;
 
 sortItem
@@ -583,7 +579,7 @@ identifierComment
 
 relationPrimary
     : multipartIdentifier sample? tableAlias  #tableName
-    | '(' queryNoWith ')' sample? tableAlias  #aliasedQuery
+    | '(' query ')' sample? tableAlias        #aliasedQuery
     | '(' relation ')' sample? tableAlias     #aliasedRelation
     | inlineTable                             #inlineTableDefault2
     | functionTable                           #tableValuedFunction
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 7b920e1..5d37e90 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -224,12 +224,13 @@ class Analyzer(
       case other => other
     }
 
-    def substituteCTE(plan: LogicalPlan, cteName: String, ctePlan: LogicalPlan): LogicalPlan = {
+    private def substituteCTE(
+        plan: LogicalPlan,
+        cteName: String,
+        ctePlan: LogicalPlan): LogicalPlan = {
       plan resolveOperatorsUp {
         case UnresolvedRelation(Seq(table)) if resolver(cteName, table) =>
           ctePlan
-        case u: UnresolvedRelation =>
-          u
         case other =>
           // This cannot be done in ResolveSubquery because ResolveSubquery does not know the CTE.
           other transformExpressions {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index f08cb2a..5eef8db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -111,7 +111,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
    * Create a top-level plan with Common Table Expressions.
    */
   override def visitQuery(ctx: QueryContext): LogicalPlan = withOrigin(ctx) {
-    val query = plan(ctx.queryNoWith)
+    val query = plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
 
     // Apply CTEs
     query.optionalMap(ctx.ctes)(withCTE)
@@ -175,10 +175,6 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
     }
   }
 
-  override def visitQueryNoWith(ctx: QueryNoWithContext): LogicalPlan = withOrigin(ctx) {
-    plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
-  }
-
   /**
    * Create a named logical plan.
    *
@@ -890,7 +886,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
    * Create a logical plan for a sub-query.
    */
   override def visitSubquery(ctx: SubqueryContext): LogicalPlan = withOrigin(ctx) {
-    plan(ctx.queryNoWith)
+    plan(ctx.query)
   }
 
   /**
@@ -978,7 +974,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
    * }}}
    */
   override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) {
-    val relation = plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)
+    val relation = plan(ctx.query).optionalMap(ctx.sample)(withSample)
     if (ctx.tableAlias.strictIdentifier == null) {
       // For un-aliased subqueries, use a default alias name that is not likely to conflict with
       // normal subquery names, so that parent operators can only access the columns in subquery by
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
index 822c5c4..ac448eb 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
@@ -4,6 +4,9 @@ create temporary view t2 as select * from values 0, 1 as t(id);
 -- WITH clause should not fall into infinite loop by referencing self
 WITH s AS (SELECT 1 FROM s) SELECT * FROM s;
 
+WITH r AS (SELECT (SELECT * FROM r))
+SELECT * FROM r;
+
 -- WITH clause should reference the base table
 WITH t AS (SELECT 1 FROM t) SELECT * FROM t;
 
@@ -28,6 +31,112 @@ FROM   CTE1 t1
 WITH t(x) AS (SELECT 1)
 SELECT * FROM t WHERE x = 1;
 
+-- CTE in CTE definition
+WITH t as (
+  WITH t2 AS (SELECT 1)
+  SELECT * FROM t2
+)
+SELECT * FROM t;
+
+-- CTE in subquery
+SELECT max(c) FROM (
+  WITH t(c) AS (SELECT 1)
+  SELECT * FROM t
+);
+
+-- CTE in subquery expression
+SELECT (
+  WITH t AS (SELECT 1)
+  SELECT * FROM t
+);
+
+-- CTE in CTE definition shadows outer
+WITH
+  t AS (SELECT 1),
+  t2 AS (
+    WITH t AS (SELECT 2)
+    SELECT * FROM t
+  )
+SELECT * FROM t2;
+
+-- CTE in CTE definition shadows outer 2
+WITH
+  t(c) AS (SELECT 1),
+  t2 AS (
+    SELECT (
+      SELECT max(c) FROM (
+        WITH t(c) AS (SELECT 2)
+        SELECT * FROM t
+      )
+    )
+  )
+SELECT * FROM t2;
+
+-- CTE in CTE definition shadows outer 3
+WITH
+  t AS (SELECT 1),
+  t2 AS (
+    WITH t AS (SELECT 2),
+    t2 AS (
+      WITH t AS (SELECT 3)
+      SELECT * FROM t
+    )
+    SELECT * FROM t2
+  )
+SELECT * FROM t2;
+
+-- CTE in subquery shadows outer
+WITH t(c) AS (SELECT 1)
+SELECT max(c) FROM (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+);
+
+-- CTE in subquery shadows outer 2
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+  SELECT max(c) AS c FROM (
+    WITH t(c) AS (SELECT 2)
+    SELECT * FROM t
+  )
+);
+
+-- CTE in subquery shadows outer 3
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+  WITH t(c) AS (SELECT 2)
+  SELECT max(c) AS c FROM (
+    WITH t(c) AS (SELECT 3)
+    SELECT * FROM t
+  )
+);
+
+-- CTE in subquery expression shadows outer
+WITH t AS (SELECT 1)
+SELECT (
+  WITH t AS (SELECT 2)
+  SELECT * FROM t
+);
+
+-- CTE in subquery expression shadows outer 2
+WITH t AS (SELECT 1)
+SELECT (
+  SELECT (
+    WITH t AS (SELECT 2)
+    SELECT * FROM t
+  )
+);
+
+-- CTE in subquery expression shadows outer 3
+WITH t AS (SELECT 1)
+SELECT (
+  WITH t AS (SELECT 2)
+  SELECT (
+    WITH t AS (SELECT 3)
+    SELECT * FROM t
+  )
+);
+
 -- Clean up
 DROP VIEW IF EXISTS t;
 DROP VIEW IF EXISTS t2;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
index f8ccecb..b89e29f 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 10
+-- Number of queries: 23
 
 
 -- !query 0
@@ -28,36 +28,46 @@ Table or view not found: s; line 1 pos 25
 
 
 -- !query 3
-WITH t AS (SELECT 1 FROM t) SELECT * FROM t
+WITH r AS (SELECT (SELECT * FROM r))
+SELECT * FROM r
 -- !query 3 schema
-struct<1:int>
+struct<>
 -- !query 3 output
+org.apache.spark.sql.AnalysisException
+Table or view not found: r; line 1 pos 33
+
+
+-- !query 4
+WITH t AS (SELECT 1 FROM t) SELECT * FROM t
+-- !query 4 schema
+struct<1:int>
+-- !query 4 output
 1
 1
 1
 
 
--- !query 4
+-- !query 5
 WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2
--- !query 4 schema
+-- !query 5 schema
 struct<>
--- !query 4 output
+-- !query 5 output
 org.apache.spark.sql.AnalysisException
 Table or view not found: s2; line 1 pos 26
 
 
--- !query 5
+-- !query 6
 WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2
--- !query 5 schema
+-- !query 6 schema
 struct<id:int,2:int>
--- !query 5 output
+-- !query 6 output
 0	2
 0	2
 1	2
 1	2
 
 
--- !query 6
+-- !query 7
 WITH CTE1 AS (
   SELECT b.id AS id
   FROM   T2 a
@@ -67,9 +77,9 @@ SELECT t1.id AS c1,
        t2.id AS c2
 FROM   CTE1 t1
        CROSS JOIN CTE1 t2
--- !query 6 schema
+-- !query 7 schema
 struct<c1:int,c2:int>
--- !query 6 output
+-- !query 7 output
 0	0
 0	0
 0	0
@@ -88,26 +98,192 @@ struct<c1:int,c2:int>
 1	1
 
 
--- !query 7
+-- !query 8
 WITH t(x) AS (SELECT 1)
 SELECT * FROM t WHERE x = 1
--- !query 7 schema
+-- !query 8 schema
 struct<x:int>
--- !query 7 output
+-- !query 8 output
 1
 
 
--- !query 8
+-- !query 9
+WITH t as (
+  WITH t2 AS (SELECT 1)
+  SELECT * FROM t2
+)
+SELECT * FROM t
+-- !query 9 schema
+struct<1:int>
+-- !query 9 output
+1
+
+
+-- !query 10
+SELECT max(c) FROM (
+  WITH t(c) AS (SELECT 1)
+  SELECT * FROM t
+)
+-- !query 10 schema
+struct<max(c):int>
+-- !query 10 output
+1
+
+
+-- !query 11
+SELECT (
+  WITH t AS (SELECT 1)
+  SELECT * FROM t
+)
+-- !query 11 schema
+struct<scalarsubquery():int>
+-- !query 11 output
+1
+
+
+-- !query 12
+WITH
+  t AS (SELECT 1),
+  t2 AS (
+    WITH t AS (SELECT 2)
+    SELECT * FROM t
+  )
+SELECT * FROM t2
+-- !query 12 schema
+struct<1:int>
+-- !query 12 output
+1
+
+
+-- !query 13
+WITH
+  t(c) AS (SELECT 1),
+  t2 AS (
+    SELECT (
+      SELECT max(c) FROM (
+        WITH t(c) AS (SELECT 2)
+        SELECT * FROM t
+      )
+    )
+  )
+SELECT * FROM t2
+-- !query 13 schema
+struct<scalarsubquery():int>
+-- !query 13 output
+1
+
+
+-- !query 14
+WITH
+  t AS (SELECT 1),
+  t2 AS (
+    WITH t AS (SELECT 2),
+    t2 AS (
+      WITH t AS (SELECT 3)
+      SELECT * FROM t
+    )
+    SELECT * FROM t2
+  )
+SELECT * FROM t2
+-- !query 14 schema
+struct<2:int>
+-- !query 14 output
+2
+
+
+-- !query 15
+WITH t(c) AS (SELECT 1)
+SELECT max(c) FROM (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+)
+-- !query 15 schema
+struct<max(c):int>
+-- !query 15 output
+2
+
+
+-- !query 16
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+  SELECT max(c) AS c FROM (
+    WITH t(c) AS (SELECT 2)
+    SELECT * FROM t
+  )
+)
+-- !query 16 schema
+struct<sum(c):bigint>
+-- !query 16 output
+2
+
+
+-- !query 17
+WITH t(c) AS (SELECT 1)
+SELECT sum(c) FROM (
+  WITH t(c) AS (SELECT 2)
+  SELECT max(c) AS c FROM (
+    WITH t(c) AS (SELECT 3)
+    SELECT * FROM t
+  )
+)
+-- !query 17 schema
+struct<sum(c):bigint>
+-- !query 17 output
+3
+
+
+-- !query 18
+WITH t AS (SELECT 1)
+SELECT (
+  WITH t AS (SELECT 2)
+  SELECT * FROM t
+)
+-- !query 18 schema
+struct<scalarsubquery():int>
+-- !query 18 output
+1
+
+
+-- !query 19
+WITH t AS (SELECT 1)
+SELECT (
+  SELECT (
+    WITH t AS (SELECT 2)
+    SELECT * FROM t
+  )
+)
+-- !query 19 schema
+struct<scalarsubquery():int>
+-- !query 19 output
+1
+
+
+-- !query 20
+WITH t AS (SELECT 1)
+SELECT (
+  WITH t AS (SELECT 2)
+  SELECT (
+    WITH t AS (SELECT 3)
+    SELECT * FROM t
+  )
+)
+-- !query 20 schema
+struct<scalarsubquery():int>
+-- !query 20 output
+1
+
+
+-- !query 21
 DROP VIEW IF EXISTS t
--- !query 8 schema
+-- !query 21 schema
 struct<>
--- !query 8 output
+-- !query 21 output
 
 
 
--- !query 9
+-- !query 22
 DROP VIEW IF EXISTS t2
--- !query 9 schema
+-- !query 22 schema
 struct<>
--- !query 9 output
+-- !query 22 output
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org