You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2017/10/20 16:27:44 UTC
spark git commit: [SPARK-21055][SQL] replace grouping__id with
grouping_id()
Repository: spark
Updated Branches:
refs/heads/master e2fea8cd6 -> 16c9cc68c
[SPARK-21055][SQL] replace grouping__id with grouping_id()
## What changes were proposed in this pull request?
Spark does not support grouping__id; it provides grouping_id() instead.
However, this is inconvenient for Hive users migrating to Spark SQL,
so this PR replaces grouping__id with grouping_id(),
meaning Hive users do not need to alter their scripts.
## How was this patch tested?
Tested with SQLQuerySuite.scala.
Author: CenYuhai <yu...@ele.me>
Closes #18270 from cenyuhai/SPARK-21055.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16c9cc68
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16c9cc68
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16c9cc68
Branch: refs/heads/master
Commit: 16c9cc68c5a70fd50e214f6deba591f0a9ae5cca
Parents: e2fea8c
Author: CenYuhai <yu...@ele.me>
Authored: Fri Oct 20 09:27:39 2017 -0700
Committer: gatorsmile <ga...@gmail.com>
Committed: Fri Oct 20 09:27:39 2017 -0700
----------------------------------------------------------------------
.../spark/sql/catalyst/analysis/Analyzer.scala | 15 +--
.../sql-tests/inputs/group-analytics.sql | 6 +-
.../sql-tests/results/group-analytics.sql.out | 43 +++++---
.../sql/hive/execution/SQLQuerySuite.scala | 110 +++++++++++++++++++
4 files changed, 148 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/16c9cc68/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 8edf575..d6a962a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.expressions.objects.{LambdaVariable, MapObjects, NewInstance, UnresolvedMapObjects}
import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, _}
+import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.sql.catalyst.util.toPrettySQL
@@ -293,12 +293,6 @@ class Analyzer(
Seq(Seq.empty)
}
- private def hasGroupingAttribute(expr: Expression): Boolean = {
- expr.collectFirst {
- case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => u
- }.isDefined
- }
-
private[analysis] def hasGroupingFunction(e: Expression): Boolean = {
e.collectFirst {
case g: Grouping => g
@@ -452,9 +446,6 @@ class Analyzer(
// This require transformUp to replace grouping()/grouping_id() in resolved Filter/Sort
def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
case a if !a.childrenResolved => a // be sure all of the children are resolved.
- case p if p.expressions.exists(hasGroupingAttribute) =>
- failAnalysis(
- s"${VirtualColumn.hiveGroupingIdName} is deprecated; use grouping_id() instead")
// Ensure group by expressions and aggregate expressions have been resolved.
case Aggregate(Seq(c @ Cube(groupByExprs)), aggregateExpressions, child)
@@ -1174,6 +1165,10 @@ class Analyzer(
case q: LogicalPlan =>
q transformExpressions {
case u if !u.childrenResolved => u // Skip until children are resolved.
+ case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) =>
+ withPosition(u) {
+ Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)()
+ }
case u @ UnresolvedGenerator(name, children) =>
withPosition(u) {
catalog.lookupFunction(name, children) match {
http://git-wip-us.apache.org/repos/asf/spark/blob/16c9cc68/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql
index 8aff4cb..9721f8c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql
@@ -38,11 +38,11 @@ SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year)
GROUP BY CUBE(course, year);
SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year;
SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year;
-SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year);
+SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year;
-- GROUPING/GROUPING_ID in having clause
SELECT course, year FROM courseSales GROUP BY CUBE(course, year)
-HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0;
+HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year;
SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0;
SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0;
SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0;
@@ -54,7 +54,7 @@ SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(co
ORDER BY GROUPING(course), GROUPING(year), course, year;
SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course);
SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course);
-SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id;
+SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year;
-- Aliases in SELECT could be used in ROLLUP/CUBE/GROUPING SETS
SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2);
http://git-wip-us.apache.org/repos/asf/spark/blob/16c9cc68/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out
index ce7a16a..3439a05 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out
@@ -223,22 +223,29 @@ grouping_id() can only be used with GroupingSets/Cube/Rollup;
-- !query 16
-SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year)
+SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year
-- !query 16 schema
-struct<>
+struct<course:string,year:int,grouping__id:int>
-- !query 16 output
-org.apache.spark.sql.AnalysisException
-grouping__id is deprecated; use grouping_id() instead;
+Java 2012 0
+Java 2013 0
+dotNET 2012 0
+dotNET 2013 0
+Java NULL 1
+dotNET NULL 1
+NULL 2012 2
+NULL 2013 2
+NULL NULL 3
-- !query 17
SELECT course, year FROM courseSales GROUP BY CUBE(course, year)
-HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0
+HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year
-- !query 17 schema
struct<course:string,year:int>
-- !query 17 output
-Java NULL
NULL NULL
+Java NULL
dotNET NULL
@@ -263,10 +270,13 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
-- !query 20
SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0
-- !query 20 schema
-struct<>
+struct<course:string,year:int>
-- !query 20 output
-org.apache.spark.sql.AnalysisException
-grouping__id is deprecated; use grouping_id() instead;
+Java NULL
+NULL 2012
+NULL 2013
+NULL NULL
+dotNET NULL
-- !query 21
@@ -322,12 +332,19 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
-- !query 25
-SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id
+SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year
-- !query 25 schema
-struct<>
+struct<course:string,year:int>
-- !query 25 output
-org.apache.spark.sql.AnalysisException
-grouping__id is deprecated; use grouping_id() instead;
+Java 2012
+Java 2013
+dotNET 2012
+dotNET 2013
+Java NULL
+dotNET NULL
+NULL 2012
+NULL 2013
+NULL NULL
-- !query 26
http://git-wip-us.apache.org/repos/asf/spark/blob/16c9cc68/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 60935c3..2476a44 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -1388,6 +1388,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #1") {
+ checkAnswer(sql(
+ "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH ROLLUP"),
+ Seq(
+ (113, 3, 0),
+ (91, 0, 0),
+ (500, null, 1),
+ (84, 1, 0),
+ (105, 2, 0),
+ (107, 4, 0)
+ ).map(i => Row(i._1, i._2, i._3)))
+ }
+
test("SPARK-8976 Wrong Result for Rollup #2") {
checkAnswer(sql(
"""
@@ -1409,6 +1422,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3, i._4)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #2") {
+ checkAnswer(sql(
+ """
+ |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3
+ |FROM src GROUP BY key%5, key-5
+ |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin),
+ Seq(
+ (1, 0, 5, 0),
+ (1, 0, 15, 0),
+ (1, 0, 25, 0),
+ (1, 0, 60, 0),
+ (1, 0, 75, 0),
+ (1, 0, 80, 0),
+ (1, 0, 100, 0),
+ (1, 0, 140, 0),
+ (1, 0, 145, 0),
+ (1, 0, 150, 0)
+ ).map(i => Row(i._1, i._2, i._3, i._4)))
+ }
+
test("SPARK-8976 Wrong Result for Rollup #3") {
checkAnswer(sql(
"""
@@ -1430,6 +1464,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3, i._4)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for Rollup #3") {
+ checkAnswer(sql(
+ """
+ |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3
+ |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5
+ |WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin),
+ Seq(
+ (1, 0, 5, 0),
+ (1, 0, 15, 0),
+ (1, 0, 25, 0),
+ (1, 0, 60, 0),
+ (1, 0, 75, 0),
+ (1, 0, 80, 0),
+ (1, 0, 100, 0),
+ (1, 0, 140, 0),
+ (1, 0, 145, 0),
+ (1, 0, 150, 0)
+ ).map(i => Row(i._1, i._2, i._3, i._4)))
+ }
+
test("SPARK-8976 Wrong Result for CUBE #1") {
checkAnswer(sql(
"SELECT count(*) AS cnt, key % 5, grouping_id() FROM src GROUP BY key%5 WITH CUBE"),
@@ -1443,6 +1498,19 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #1") {
+ checkAnswer(sql(
+ "SELECT count(*) AS cnt, key % 5, grouping__id FROM src GROUP BY key%5 WITH CUBE"),
+ Seq(
+ (113, 3, 0),
+ (91, 0, 0),
+ (500, null, 1),
+ (84, 1, 0),
+ (105, 2, 0),
+ (107, 4, 0)
+ ).map(i => Row(i._1, i._2, i._3)))
+ }
+
test("SPARK-8976 Wrong Result for CUBE #2") {
checkAnswer(sql(
"""
@@ -1464,6 +1532,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3, i._4)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for CUBE #2") {
+ checkAnswer(sql(
+ """
+ |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3
+ |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5
+ |WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin),
+ Seq(
+ (1, null, -3, 2),
+ (1, null, -1, 2),
+ (1, null, 3, 2),
+ (1, null, 4, 2),
+ (1, null, 5, 2),
+ (1, null, 6, 2),
+ (1, null, 12, 2),
+ (1, null, 14, 2),
+ (1, null, 15, 2),
+ (1, null, 22, 2)
+ ).map(i => Row(i._1, i._2, i._3, i._4)))
+ }
+
test("SPARK-8976 Wrong Result for GroupingSet") {
checkAnswer(sql(
"""
@@ -1485,6 +1574,27 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
).map(i => Row(i._1, i._2, i._3, i._4)))
}
+ test("SPARK-21055 replace grouping__id: Wrong Result for GroupingSet") {
+ checkAnswer(sql(
+ """
+ |SELECT count(*) AS cnt, key % 5 AS k1, key-5 AS k2, grouping__id AS k3
+ |FROM (SELECT key, key%2, key - 5 FROM src) t GROUP BY key%5, key-5
+ |GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin),
+ Seq(
+ (1, null, -3, 2),
+ (1, null, -1, 2),
+ (1, null, 3, 2),
+ (1, null, 4, 2),
+ (1, null, 5, 2),
+ (1, null, 6, 2),
+ (1, null, 12, 2),
+ (1, null, 14, 2),
+ (1, null, 15, 2),
+ (1, null, 22, 2)
+ ).map(i => Row(i._1, i._2, i._3, i._4)))
+ }
+
ignore("SPARK-10562: partition by column with mixed case name") {
withTable("tbl10562") {
val df = Seq(2012 -> "a").toDF("Year", "val")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org