Posted to commits@spark.apache.org by li...@apache.org on 2018/07/20 06:52:56 UTC

spark git commit: [SPARK-24424][SQL] Support ANSI-SQL compliant syntax for GROUPING SET

Repository: spark
Updated Branches:
  refs/heads/master a5925c163 -> 2b91d9918


[SPARK-24424][SQL] Support ANSI-SQL compliant syntax for GROUPING SET

## What changes were proposed in this pull request?

Enhances the parser and analyzer to support ANSI-compliant syntax for GROUPING SETS. As part of this change, we derive the grouping expressions from the user-supplied groupings in the GROUPING SETS clause.

```SQL
SELECT c1, c2, max(c3)
FROM t1
GROUP BY GROUPING SETS ((c1), (c1, c2))
```
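
For reference, the same query can be written in the pre-existing (non-ANSI) Spark syntax, which lists the grouping expressions explicitly before the GROUPING SETS clause; both forms should produce the same result. When the explicit list is omitted, the grouping expressions are derived from the grouping sets, and semantically equal expressions (e.g. `a + b` and `b + a`) are collapsed into one; the second query below, taken from the added tests, exercises that case.

```SQL
-- Pre-existing (non-ANSI) form of the query above, with explicit grouping expressions.
SELECT c1, c2, max(c3)
FROM t1
GROUP BY c1, c2
GROUPING SETS ((c1), (c1, c2));

-- From the added tests: (a + b) and (b + a) are semantically equal and are
-- de-duplicated into a single derived grouping expression.
SELECT a + b, b, sum(c)
FROM (VALUES (1, 1, 1), (2, 2, 2)) AS t(a, b, c)
GROUP BY GROUPING SETS ((a + b), (b + a), (b));
```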

## How was this patch tested?
Added tests in SQLQueryTestSuite and ResolveGroupingAnalyticsSuite.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Dilip Biswal <db...@us.ibm.com>

Closes #21813 from dilipbiswal/spark-24424.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b91d991
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b91d991
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b91d991

Branch: refs/heads/master
Commit: 2b91d9918c8eaec6c32a502e2f08b63c475d3335
Parents: a5925c1
Author: Dilip Biswal <db...@us.ibm.com>
Authored: Thu Jul 19 23:52:53 2018 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Thu Jul 19 23:52:53 2018 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/parser/SqlBase.g4 |   1 +
 .../spark/sql/catalyst/analysis/Analyzer.scala  |  22 +++-
 .../ResolveGroupingAnalyticsSuite.scala         |  28 +++++
 .../resources/sql-tests/inputs/grouping_set.sql |  36 ++++++
 .../sql-tests/results/grouping_set.sql.out      | 126 ++++++++++++++++++-
 5 files changed, 210 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2b91d991/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 1b43874..2aca10f 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -406,6 +406,7 @@ aggregation
       WITH kind=ROLLUP
     | WITH kind=CUBE
     | kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')')?
+    | GROUP BY kind=GROUPING SETS '(' groupingSet (',' groupingSet)* ')'
     ;
 
 groupingSet

http://git-wip-us.apache.org/repos/asf/spark/blob/2b91d991/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 7c5504d..957c468 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -442,17 +442,35 @@ class Analyzer(
         child: LogicalPlan): LogicalPlan = {
       val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
 
+      // With the ANSI-SQL compliant syntax for GROUPING SETS, groupByExprs is optional and
+      // can be empty. In that case, we derive groupByExprs from the user-supplied values in
+      // the grouping sets.
+      val finalGroupByExpressions = if (groupByExprs == Nil) {
+        selectedGroupByExprs.flatten.foldLeft(Seq.empty[Expression]) { (result, currentExpr) =>
+          // Only unique expressions are included in the group by expressions; uniqueness is
+          // determined by semantic equality. For example, grouping sets ((a * b), (b * a))
+          // result in a single grouping expression, (a * b).
+          if (result.find(_.semanticEquals(currentExpr)).isDefined) {
+            result
+          } else {
+            result :+ currentExpr
+          }
+        }
+      } else {
+        groupByExprs
+      }
+
       // Expand works by setting grouping expressions to null as determined by the
       // `selectedGroupByExprs`. To prevent these null values from being used in an aggregate
       // instead of the original value we need to create new aliases for all group by expressions
       // that will only be used for the intended purpose.
-      val groupByAliases = constructGroupByAlias(groupByExprs)
+      val groupByAliases = constructGroupByAlias(finalGroupByExpressions)
 
       val expand = constructExpand(selectedGroupByExprs, child, groupByAliases, gid)
       val groupingAttrs = expand.output.drop(child.output.length)
 
       val aggregations = constructAggregateExprs(
-        groupByExprs, aggregationExprs, groupByAliases, groupingAttrs, gid)
+        finalGroupByExpressions, aggregationExprs, groupByAliases, groupingAttrs, gid)
 
       Aggregate(groupingAttrs, aggregations, expand)
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/2b91d991/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala
index 553b159..8da4d7e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala
@@ -91,6 +91,34 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest {
     assertAnalysisError(originalPlan3, Seq("doesn't show up in the GROUP BY list"))
   }
 
+  test("grouping sets with no explicit group by expressions") {
+    val originalPlan = GroupingSets(Seq(Seq(), Seq(unresolved_a), Seq(unresolved_a, unresolved_b)),
+      Nil, r1,
+      Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))))
+    val expected = Aggregate(Seq(a, b, gid), Seq(a, b, count(c).as("count(c)")),
+      Expand(
+        Seq(Seq(a, b, c, nulInt, nulStr, 3), Seq(a, b, c, a, nulStr, 1), Seq(a, b, c, a, b, 0)),
+        Seq(a, b, c, a, b, gid),
+        Project(Seq(a, b, c, a.as("a"), b.as("b")), r1)))
+    checkAnalysis(originalPlan, expected)
+
+    // Computation of grouping expressions should remove duplicate expressions based on
+    // their semantics (semanticEquals).
+    val originalPlan2 = GroupingSets(Seq(Seq(Multiply(unresolved_a, Literal(2))),
+      Seq(Multiply(Literal(2), unresolved_a), unresolved_b)), Nil, r1,
+      Seq(UnresolvedAlias(Multiply(unresolved_a, Literal(2))),
+        unresolved_b, UnresolvedAlias(count(unresolved_c))))
+
+    val resultPlan = getAnalyzer(true).executeAndCheck(originalPlan2)
+    val gExpressions = resultPlan.asInstanceOf[Aggregate].groupingExpressions
+    assert(gExpressions.size == 3)
+    val firstGroupingExprAttrName =
+      gExpressions(0).asInstanceOf[AttributeReference].name.replaceAll("#[0-9]*", "#0")
+    assert(firstGroupingExprAttrName == "(a#0 * 2)")
+    assert(gExpressions(1).asInstanceOf[AttributeReference].name == "b")
+    assert(gExpressions(2).asInstanceOf[AttributeReference].name == VirtualColumn.groupingIdName)
+  }
+
   test("cube") {
     val originalPlan = Aggregate(Seq(Cube(Seq(unresolved_a, unresolved_b))),
       Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c))), r1)

http://git-wip-us.apache.org/repos/asf/spark/blob/2b91d991/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
index 3594283..6bbde9f 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
@@ -13,5 +13,41 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a));
 -- SPARK-17849: grouping set throws NPE #3
 SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c));
 
+-- Grouping sets without explicit group by
+SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);
 
+-- Grouping sets without explicit group by, with the grouping() function
+SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1);
+
+-- Multiple groupings within a grouping set
+SELECT c1, c2, Sum(c3), grouping__id
+FROM   (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c2 ) )
+HAVING GROUPING__ID > 1;
+
+-- Grouping sets without explicit group by, with grouping() over multiple sets
+SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2);
+
+-- Aliased grouping column over multiple grouping sets
+SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2));
+
+-- complex expression in grouping sets
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b));
+
+-- complex expression in grouping sets
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b));
+
+-- more query constructs with grouping sets
+SELECT c1 AS col1, c2 AS col2
+FROM   (VALUES (1, 2), (3, 2)) t(c1, c2)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
+HAVING col2 IS NOT NULL
+ORDER  BY -col1;
+
+-- negative tests - must have at least one grouping expression
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP;
+
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE;
+
+SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (());
 

http://git-wip-us.apache.org/repos/asf/spark/blob/2b91d991/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
index edb38a5..34ab09c 100644
--- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 4
+-- Number of queries: 15
 
 
 -- !query 0
@@ -40,3 +40,127 @@ struct<a:string,b:string,c:string,count(d):bigint>
 NULL	NULL	3	1
 NULL	NULL	6	1
 NULL	NULL	9	1
+
+
+-- !query 4
+SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
+-- !query 4 schema
+struct<c1:string,sum(c2):bigint>
+-- !query 4 output
+x	10
+y	20
+
+
+-- !query 5
+SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1)
+-- !query 5 schema
+struct<c1:string,sum(c2):bigint,grouping(c1):tinyint>
+-- !query 5 output
+x	10	0
+y	20	0
+
+
+-- !query 6
+SELECT c1, c2, Sum(c3), grouping__id
+FROM   (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c2 ) )
+HAVING GROUPING__ID > 1
+-- !query 6 schema
+struct<c1:string,c2:string,sum(c3):bigint,grouping__id:int>
+-- !query 6 output
+NULL	a	10	2
+NULL	b	20	2
+
+
+-- !query 7
+SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2)
+-- !query 7 schema
+struct<grouping(c1):tinyint>
+-- !query 7 output
+0
+0
+1
+1
+
+
+-- !query 8
+SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2))
+-- !query 8 schema
+struct<c1:int>
+-- !query 8 output
+-1
+-1
+-3
+-3
+
+
+-- !query 9
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b))
+-- !query 9 schema
+struct<(a + b):int,b:int,sum(c):bigint>
+-- !query 9 output
+2	NULL	1
+4	NULL	2
+NULL	1	1
+NULL	2	2
+
+
+-- !query 10
+SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b))
+-- !query 10 schema
+struct<(a + b):int,b:int,sum(c):bigint>
+-- !query 10 output
+2	NULL	2
+4	NULL	4
+NULL	1	1
+NULL	2	2
+
+
+-- !query 11
+SELECT c1 AS col1, c2 AS col2
+FROM   (VALUES (1, 2), (3, 2)) t(c1, c2)
+GROUP  BY GROUPING SETS ( ( c1 ), ( c1, c2 ) )
+HAVING col2 IS NOT NULL
+ORDER  BY -col1
+-- !query 11 schema
+struct<col1:int,col2:int>
+-- !query 11 output
+3	2
+1	2
+
+
+-- !query 12
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
+-- !query 12 schema
+struct<>
+-- !query 12 output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+extraneous input 'ROLLUP' expecting <EOF>(line 1, pos 53)
+
+== SQL ==
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP
+-----------------------------------------------------^^^
+
+
+-- !query 13
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
+-- !query 13 schema
+struct<>
+-- !query 13 output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+extraneous input 'CUBE' expecting <EOF>(line 1, pos 53)
+
+== SQL ==
+SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE
+-----------------------------------------------------^^^
+
+
+-- !query 14
+SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (())
+-- !query 14 schema
+struct<>
+-- !query 14 output
+org.apache.spark.sql.AnalysisException
+expression '`c1`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org