Posted to commits@spark.apache.org by hv...@apache.org on 2016/02/11 08:42:01 UTC

spark git commit: [SPARK-13235][SQL] Removed an Extra Distinct from the Plan when Using Union in SQL

Repository: spark
Updated Branches:
  refs/heads/master 1842c55d8 -> e88bff127


[SPARK-13235][SQL] Removed an Extra Distinct from the Plan when Using Union in SQL

Currently, the parser adds two `Distinct` operators to the plan when `Union` or `Union Distinct` is used in SQL. This PR removes the extra `Distinct` from the plan.

For example, before the fix, the following query produces a plan with two `Distinct` operators:
```scala
sql("select * from t0 union select * from t0").explain(true)
```
```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_2
   +- 'Distinct
      +- 'Project [unresolvedalias(*,None)]
         +- 'Subquery u_1
            +- 'Distinct
               +- 'Union
                  :- 'Project [unresolvedalias(*,None)]
                  :  +- 'UnresolvedRelation `t0`, None
                  +- 'Project [unresolvedalias(*,None)]
                     +- 'UnresolvedRelation `t0`, None

== Analyzed Logical Plan ==
id: bigint
Project [id#16L]
+- Subquery u_2
   +- Distinct
      +- Project [id#16L]
         +- Subquery u_1
            +- Distinct
               +- Union
                  :- Project [id#16L]
                  :  +- Subquery t0
                  :     +- Relation[id#16L] ParquetRelation
                  +- Project [id#16L]
                     +- Subquery t0
                        +- Relation[id#16L] ParquetRelation

== Optimized Logical Plan ==
Aggregate [id#16L], [id#16L]
+- Aggregate [id#16L], [id#16L]
   +- Union
      :- Project [id#16L]
      :  +- Relation[id#16L] ParquetRelation
      +- Project [id#16L]
         +- Relation[id#16L] ParquetRelation
```
After the fix, the extra `Distinct` is gone from the plan:
```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_1
   +- 'Distinct
      +- 'Union
         :- 'Project [unresolvedalias(*,None)]
         :  +- 'UnresolvedRelation `t0`, None
         +- 'Project [unresolvedalias(*,None)]
            +- 'UnresolvedRelation `t0`, None

== Analyzed Logical Plan ==
id: bigint
Project [id#17L]
+- Subquery u_1
   +- Distinct
      +- Union
         :- Project [id#16L]
         :  +- Subquery t0
         :     +- Relation[id#16L] ParquetRelation
         +- Project [id#16L]
            +- Subquery t0
               +- Relation[id#16L] ParquetRelation

== Optimized Logical Plan ==
Aggregate [id#17L], [id#17L]
+- Union
   :- Project [id#16L]
   :  +- Relation[id#16L] ParquetRelation
   +- Project [id#16L]
      +- Relation[id#16L] ParquetRelation
```
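
To sanity-check the new behavior interactively, one can count the `Distinct` nodes in the parsed logical plan. This is a minimal sketch, not part of the patch: it assumes a `SQLContext` in scope as `sqlContext` and a registered temporary table `t0`, as in the example above.
```scala
import org.apache.spark.sql.catalyst.plans.logical.Distinct

// Parsed (unanalyzed) logical plan of a plain UNION query.
val plan = sqlContext.sql("select * from t0 union select * from t0")
  .queryExecution.logical

// With this patch there should be exactly one Distinct node in the tree.
val distincts = plan.collect { case d: Distinct => d }
assert(distincts.size == 1, s"expected one Distinct, found ${distincts.size}")
```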

Author: gatorsmile <ga...@gmail.com>

Closes #11120 from gatorsmile/unionDistinct.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e88bff12
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e88bff12
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e88bff12

Branch: refs/heads/master
Commit: e88bff12795a6134e2e7204996b603e948380e18
Parents: 1842c55
Author: gatorsmile <ga...@gmail.com>
Authored: Thu Feb 11 08:40:27 2016 +0100
Committer: Herman van Hovell <hv...@questtec.nl>
Committed: Thu Feb 11 08:40:27 2016 +0100

----------------------------------------------------------------------
 .../spark/sql/catalyst/parser/SparkSqlParser.g  | 28 +----------------
 .../spark/sql/catalyst/CatalystQlSuite.scala    | 33 ++++++++++++++++++--
 2 files changed, 32 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e88bff12/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
index 9f2a5eb..24483cc 100644
--- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
+++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
@@ -2370,34 +2370,8 @@ setOpSelectStatement[CommonTree t, boolean topLevel]
     u=setOperator LPAREN b=simpleSelectStatement RPAREN
     |
     u=setOperator b=simpleSelectStatement)
-   -> {$setOpSelectStatement.tree != null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}?
-      ^(TOK_QUERY
-          ^(TOK_FROM
-            ^(TOK_SUBQUERY
-              ^($u {$setOpSelectStatement.tree} $b)
-              {adaptor.create(Identifier, generateUnionAlias())}
-             )
-          )
-          ^(TOK_INSERT
-             ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE))
-             ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF))
-          )
-       )
-   -> {$setOpSelectStatement.tree != null && $u.tree.getType()!=SparkSqlParser.TOK_UNIONDISTINCT}?
+   -> {$setOpSelectStatement.tree != null}?
       ^($u {$setOpSelectStatement.tree} $b)
-   -> {$setOpSelectStatement.tree == null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}?
-      ^(TOK_QUERY
-          ^(TOK_FROM
-            ^(TOK_SUBQUERY
-              ^($u {$t} $b)
-              {adaptor.create(Identifier, generateUnionAlias())}
-             )
-           )
-          ^(TOK_INSERT
-            ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE))
-            ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF))
-         )
-       )
    -> ^($u {$t} $b)
    )+
    o=orderByClause?

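With the extra wrapping removed from the grammar, a plain `UNION`/`UNION DISTINCT` now parses to a single `Distinct` over the `Union` inside one generated subquery alias, which is what the new tests below construct. For reference, a sketch of that plan shape built directly from Catalyst's logical operators (the table names `t0`/`t1` and the `u_1` alias mirror the tests; this is illustrative only, not part of the patch):
```scala
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedRelation, UnresolvedStar}
import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Project, Subquery, Union}

// SELECT * FROM t0 and SELECT * FROM t1, still unresolved at parse time.
val left = Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
  UnresolvedRelation(TableIdentifier("t0"), None))
val right = Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
  UnresolvedRelation(TableIdentifier("t1"), None))

// UNION / UNION DISTINCT: one generated subquery alias, one Distinct, one Union.
val unionDistinct = Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
  Subquery("u_1", Distinct(Union(left, right))))

// UNION ALL: same shape without the Distinct.
val unionAll = Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
  Subquery("u_1", Union(left, right)))
```
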
http://git-wip-us.apache.org/repos/asf/spark/blob/e88bff12/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
index 682b77d..8d7d6b5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
@@ -18,10 +18,10 @@
 package org.apache.spark.sql.catalyst
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction}
+import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.unsafe.types.CalendarInterval
 
 class CatalystQlSuite extends PlanTest {
@@ -45,6 +45,35 @@ class CatalystQlSuite extends PlanTest {
     comparePlans(parsed, expected)
   }
 
+  test("test Union Distinct operator") {
+    val parsed1 = parser.parsePlan("SELECT * FROM t0 UNION SELECT * FROM t1")
+    val parsed2 = parser.parsePlan("SELECT * FROM t0 UNION DISTINCT SELECT * FROM t1")
+    val expected =
+      Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+        Subquery("u_1",
+          Distinct(
+            Union(
+              Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+                UnresolvedRelation(TableIdentifier("t0"), None)),
+              Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+                UnresolvedRelation(TableIdentifier("t1"), None))))))
+    comparePlans(parsed1, expected)
+    comparePlans(parsed2, expected)
+  }
+
+  test("test Union All operator") {
+    val parsed = parser.parsePlan("SELECT * FROM t0 UNION ALL SELECT * FROM t1")
+    val expected =
+      Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+        Subquery("u_1",
+          Union(
+            Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+              UnresolvedRelation(TableIdentifier("t0"), None)),
+            Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+              UnresolvedRelation(TableIdentifier("t1"), None)))))
+    comparePlans(parsed, expected)
+  }
+
   test("support hive interval literal") {
     def checkInterval(sql: String, result: CalendarInterval): Unit = {
       val parsed = parser.parsePlan(sql)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org