You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yh...@apache.org on 2015/07/16 08:35:31 UTC
spark git commit: [SPARK-8972] [SQL] Incorrect result for rollup
Repository: spark
Updated Branches:
refs/heads/master ba3309684 -> e27212317
[SPARK-8972] [SQL] Incorrect result for rollup
We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result.
e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP`
This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names).
Author: Cheng Hao <ha...@intel.com>
Closes #7343 from chenghao-intel/expand and squashes the following commits:
1ebbb59 [Cheng Hao] update the comment
827873f [Cheng Hao] update as feedback
34def69 [Cheng Hao] Add more unit test and comments
c695760 [Cheng Hao] fix bug of incorrect result for rollup
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2721231
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2721231
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2721231
Branch: refs/heads/master
Commit: e27212317c7341852c52d9a85137b8f94cb0d935
Parents: ba33096
Author: Cheng Hao <ha...@intel.com>
Authored: Wed Jul 15 23:35:27 2015 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Wed Jul 15 23:35:27 2015 -0700
----------------------------------------------------------------------
.../spark/sql/catalyst/analysis/Analyzer.scala | 42 +++++++++++++--
...r CUBE #1-0-63b61fb3f0e74226001ad279be440864 | 6 +++
...r CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 | 10 ++++
...oupingSet-0-8c14c24670a4b06c440346277ce9cf1c | 10 ++++
...Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a | 6 +++
...Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 | 10 ++++
...Rollup #3-0-9257085d123728730be96b6d9fbb84ce | 10 ++++
.../sql/hive/execution/HiveQuerySuite.scala | 54 ++++++++++++++++++++
8 files changed, 145 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 891408e..df8e7f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -194,16 +194,52 @@ class Analyzer(
}
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+ case a if !a.childrenResolved => a // be sure all of the children are resolved.
case a: Cube =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case a: Rollup =>
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
case x: GroupingSets =>
val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
+ // We will insert another Projection if the GROUP BY keys contains the
+ // non-attribute expressions. And the top operators can references those
+ // expressions by its alias.
+ // e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
+ // SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a
+
+ // find all of the non-attribute expressions in the GROUP BY keys
+ val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()
+
+ // The pair of (the original GROUP BY key, associated attribute)
+ val groupByExprPairs = x.groupByExprs.map(_ match {
+ case e: NamedExpression => (e, e.toAttribute)
+ case other => {
+ val alias = Alias(other, other.toString)()
+ nonAttributeGroupByExpressions += alias // add the non-attributes expression alias
+ (other, alias.toAttribute)
+ }
+ })
+
+ // substitute the non-attribute expressions for aggregations.
+ val aggregation = x.aggregations.map(expr => expr.transformDown {
+ case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
+ }.asInstanceOf[NamedExpression])
+
+ // substitute the group by expressions.
+ val newGroupByExprs = groupByExprPairs.map(_._2)
+
+ val child = if (nonAttributeGroupByExpressions.length > 0) {
+ // insert additional projection if contains the
+ // non-attribute expressions in the GROUP BY keys
+ Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
+ } else {
+ x.child
+ }
+
Aggregate(
- x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
- x.aggregations,
- Expand(x.bitmasks, x.groupByExprs, gid, x.child))
+ newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
+ aggregation,
+ Expand(x.bitmasks, newGroupByExprs, gid, child))
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
new file mode 100644
index 0000000..dac1b84
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #1-0-63b61fb3f0e74226001ad279be440864
@@ -0,0 +1,6 @@
+500 NULL 0
+91 0 1
+84 1 1
+105 2 1
+113 3 1
+107 4 1
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
new file mode 100644
index 0000000..c7cb747
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for CUBE #2-0-7a511f02a16f0af4f810b1666cfcd896
@@ -0,0 +1,10 @@
+1 NULL -3 2
+1 NULL -1 2
+1 NULL 3 2
+1 NULL 4 2
+1 NULL 5 2
+1 NULL 6 2
+1 NULL 12 2
+1 NULL 14 2
+1 NULL 15 2
+1 NULL 22 2
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
new file mode 100644
index 0000000..c7cb747
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for GroupingSet-0-8c14c24670a4b06c440346277ce9cf1c
@@ -0,0 +1,10 @@
+1 NULL -3 2
+1 NULL -1 2
+1 NULL 3 2
+1 NULL 4 2
+1 NULL 5 2
+1 NULL 6 2
+1 NULL 12 2
+1 NULL 14 2
+1 NULL 15 2
+1 NULL 22 2
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
new file mode 100644
index 0000000..dac1b84
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #1-0-a78e3dbf242f240249e36b3d3fd0926a
@@ -0,0 +1,6 @@
+500 NULL 0
+91 0 1
+84 1 1
+105 2 1
+113 3 1
+107 4 1
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89 b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
new file mode 100644
index 0000000..1eea4a9
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #2-0-bf180c9d1a18f61b9d9f31bb0115cf89
@@ -0,0 +1,10 @@
+1 0 5 3
+1 0 15 3
+1 0 25 3
+1 0 60 3
+1 0 75 3
+1 0 80 3
+1 0 100 3
+1 0 140 3
+1 0 145 3
+1 0 150 3
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce
new file mode 100644
index 0000000..1eea4a9
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/SPARK-8976 Wrong Result for Rollup #3-0-9257085d123728730be96b6d9fbb84ce
@@ -0,0 +1,10 @@
+1 0 5 3
+1 0 15 3
+1 0 25 3
+1 0 60 3
+1 0 75 3
+1 0 80 3
+1 0 100 3
+1 0 140 3
+1 0 145 3
+1 0 150 3
http://git-wip-us.apache.org/repos/asf/spark/blob/e2721231/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 991da2f..11a843b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
}
}
+ createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
+ """
+ SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP
+ """.stripMargin)
+
+ createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
+ """
+ SELECT
+ count(*) AS cnt,
+ key % 5 as k1,
+ key-5 as k2,
+ GROUPING__ID as k3
+ FROM src group by key%5, key-5
+ WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin)
+
+ createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
+ """
+ SELECT
+ count(*) AS cnt,
+ key % 5 as k1,
+ key-5 as k2,
+ GROUPING__ID as k3
+ FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+ WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin)
+
+ createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
+ """
+ SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE
+ """.stripMargin)
+
+ createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
+ """
+ SELECT
+ count(*) AS cnt,
+ key % 5 as k1,
+ key-5 as k2,
+ GROUPING__ID as k3
+ FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+ WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin)
+
+ createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
+ """
+ SELECT
+ count(*) AS cnt,
+ key % 5 as k1,
+ key-5 as k2,
+ GROUPING__ID as k3
+ FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
+ GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
+ """.stripMargin)
+
createQueryTest("insert table with generator with column name",
"""
| CREATE TABLE gen_tmp (key Int);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org