You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/04/26 22:33:13 UTC

[spark] branch branch-3.0 updated: [SPARK-31535][SQL] Fix nested CTE substitution

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 79444d6  [SPARK-31535][SQL] Fix nested CTE substitution
79444d6 is described below

commit 79444d6d6f1e63f28a27420b3165e1115c35c687
Author: Peter Toth <pe...@gmail.com>
AuthorDate: Sun Apr 26 15:31:32 2020 -0700

    [SPARK-31535][SQL] Fix nested CTE substitution
    
    ### What changes were proposed in this pull request?
    
    This PR fixes a CTE substitution issue so as to the following SQL return the correct empty result:
    ```
    WITH t(c) AS (SELECT 1)
    SELECT * FROM t
    WHERE c IN (
      WITH t(c) AS (SELECT 2)
      SELECT * FROM t
    )
    ```
    Before this PR the result was `1`.
    
    ### Why are the changes needed?
    To fix a correctness issue.
    
    ### Does this PR introduce any user-facing change?
    Yes, fixes a correctness issue.
    
    ### How was this patch tested?
    Added new test case.
    
    Closes #28318 from peter-toth/SPARK-31535-fix-nested-cte-substitution.
    
    Authored-by: Peter Toth <pe...@gmail.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit 4f53bfbbd55401d56dda2fbdbbbae4eea660ded6)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../sql/catalyst/analysis/CTESubstitution.scala    | 61 ++++++++--------------
 .../test/resources/sql-tests/inputs/cte-nested.sql |  8 +++
 .../resources/sql-tests/results/cte-legacy.sql.out | 15 +++++-
 .../resources/sql-tests/results/cte-nested.sql.out | 16 +++++-
 .../sql-tests/results/cte-nonlegacy.sql.out        | 15 +++++-
 5 files changed, 72 insertions(+), 43 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
index c8078e1..d9fdb56 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
@@ -31,28 +31,28 @@ object CTESubstitution extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = {
     LegacyBehaviorPolicy.withName(SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_POLICY)) match {
       case LegacyBehaviorPolicy.EXCEPTION =>
-        assertNoNameConflictsInCTE(plan, inTraverse = false)
-        traverseAndSubstituteCTE(plan, inTraverse = false)
+        assertNoNameConflictsInCTE(plan)
+        traverseAndSubstituteCTE(plan)
       case LegacyBehaviorPolicy.LEGACY =>
         legacyTraverseAndSubstituteCTE(plan)
       case LegacyBehaviorPolicy.CORRECTED =>
-        traverseAndSubstituteCTE(plan, inTraverse = false)
+        traverseAndSubstituteCTE(plan)
     }
   }
 
   /**
    * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through
-   * child, innerChildren and subquery for the current plan.
+   * child, innerChildren and subquery expressions for the current plan.
    */
   private def assertNoNameConflictsInCTE(
       plan: LogicalPlan,
-      inTraverse: Boolean,
-      cteNames: Set[String] = Set.empty): Unit = {
-    plan.foreach {
+      outerCTERelationNames: Set[String] = Set.empty,
+      namesInSubqueries: Set[String] = Set.empty): Unit = {
+    plan match {
       case w @ With(child, relations) =>
         val newNames = relations.map {
           case (cteName, _) =>
-            if (cteNames.contains(cteName)) {
+            if (outerCTERelationNames.contains(cteName)) {
               throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " +
                 s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " +
                 "defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " +
@@ -60,24 +60,15 @@ object CTESubstitution extends Rule[LogicalPlan] {
             } else {
               cteName
             }
-        }.toSet
-        child.transformExpressions {
-          case e: SubqueryExpression =>
-            assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames ++ newNames)
-            e
-        }
-        w.innerChildren.foreach { p =>
-          assertNoNameConflictsInCTE(p, inTraverse = true, cteNames ++ newNames)
-        }
-
-      case other if inTraverse =>
-        other.transformExpressions {
-          case e: SubqueryExpression =>
-            assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames)
-            e
-        }
+        }.toSet ++ namesInSubqueries
+        assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
+        w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames))
 
-      case _ =>
+      case other =>
+        other.subqueries.foreach(
+          assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
+        other.children.foreach(
+          assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries))
     }
   }
 
@@ -131,37 +122,27 @@ object CTESubstitution extends Rule[LogicalPlan] {
    *     SELECT * FROM t
    *   )
    * @param plan the plan to be traversed
-   * @param inTraverse whether the current traverse is called from another traverse, only in this
-   *                   case name collision can occur
    * @return the plan where CTE substitution is applied
    */
-  private def traverseAndSubstituteCTE(plan: LogicalPlan, inTraverse: Boolean): LogicalPlan = {
+  private def traverseAndSubstituteCTE(plan: LogicalPlan): LogicalPlan = {
     plan.resolveOperatorsUp {
       case With(child: LogicalPlan, relations) =>
-        // child might contain an inner CTE that has priority so traverse and substitute inner CTEs
-        // in child first
-        val traversedChild: LogicalPlan = child transformExpressions {
-          case e: SubqueryExpression => e.withNewPlan(traverseAndSubstituteCTE(e.plan, true))
-        }
-
         // Substitute CTE definitions from last to first as a CTE definition can reference a
         // previous one
-        relations.foldRight(traversedChild) {
+        relations.foldRight(child) {
           case ((cteName, ctePlan), currentPlan) =>
             // A CTE definition might contain an inner CTE that has priority, so traverse and
             // substitute CTE defined in ctePlan.
             // A CTE definition might not be used at all or might be used multiple times. To avoid
             // computation if it is not used and to avoid multiple recomputation if it is used
             // multiple times we use a lazy construct with call-by-name parameter passing.
-            lazy val substitutedCTEPlan = traverseAndSubstituteCTE(ctePlan, true)
+            lazy val substitutedCTEPlan = traverseAndSubstituteCTE(ctePlan)
             substituteCTE(currentPlan, cteName, substitutedCTEPlan)
         }
 
-      // CTE name collision can occur only when inTraverse is true, it helps to avoid eager CTE
-      // substitution in a subquery expression.
-      case other if inTraverse =>
+      case other =>
         other.transformExpressions {
-          case e: SubqueryExpression => e.withNewPlan(traverseAndSubstituteCTE(e.plan, true))
+          case e: SubqueryExpression => e.withNewPlan(traverseAndSubstituteCTE(e.plan))
         }
     }
   }
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
index 5e5e3a5..134f88b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
@@ -103,3 +103,11 @@ SELECT (
     SELECT * FROM t
   )
 );
+
+-- CTE in subquery expression shadows outer 4
+WITH t(c) AS (SELECT 1)
+SELECT * FROM t
+WHERE c IN (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+);
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
index e8020e1..629daf7 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 12
+-- Number of queries: 13
 
 
 -- !query
@@ -166,3 +166,16 @@ SELECT (
 struct<scalarsubquery():int>
 -- !query output
 1
+
+
+-- !query
+WITH t(c) AS (SELECT 1)
+SELECT * FROM t
+WHERE c IN (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+)
+-- !query schema
+struct<c:int>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
index 64d635a..34e9be0 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 12
+-- Number of queries: 13
 
 
 -- !query
@@ -172,3 +172,17 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH t(c) AS (SELECT 1)
+SELECT * FROM t
+WHERE c IN (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
index 9422bb6..6eba1ad 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 12
+-- Number of queries: 13
 
 
 -- !query
@@ -166,3 +166,16 @@ SELECT (
 struct<scalarsubquery():int>
 -- !query output
 3
+
+
+-- !query
+WITH t(c) AS (SELECT 1)
+SELECT * FROM t
+WHERE c IN (
+  WITH t(c) AS (SELECT 2)
+  SELECT * FROM t
+)
+-- !query schema
+struct<c:int>
+-- !query output
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org