You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/08/26 07:26:06 UTC

[spark] branch branch-3.2 updated: [SPARK-40218][SQL] GROUPING SETS should preserve the grouping columns

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new de649f0759a [SPARK-40218][SQL] GROUPING SETS should preserve the grouping columns
de649f0759a is described below

commit de649f0759ab84d2baf9f51d1fd53d35a49ef9a8
Author: Wenchen Fan <we...@databricks.com>
AuthorDate: Fri Aug 26 15:24:01 2022 +0800

    [SPARK-40218][SQL] GROUPING SETS should preserve the grouping columns
    
    ### What changes were proposed in this pull request?
    
    This PR fixes a bug caused by https://github.com/apache/spark/pull/32022 . Although we deprecate `GROUP BY ... GROUPING SETS ...`, it should still work if it worked before.
    
    https://github.com/apache/spark/pull/32022 made a mistake in that it didn't preserve the order of user-specified group by columns. Usually it's not a problem, as `GROUP BY a, b` is no different from `GROUP BY b, a`. However, the `grouping_id(...)` function requires the input to be exactly the same as the group by columns. This PR fixes the problem by preserving the order of user-specified group by columns.
    
    ### Why are the changes needed?
    
    bug fix
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, now a query that worked before 3.2 can work again.
    
    ### How was this patch tested?
    
    new test
    
    Closes #37655 from cloud-fan/grouping.
    
    Authored-by: Wenchen Fan <we...@databricks.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 1ed592ef28abdb14aa1d8c8a129d6ac3084ffb0c)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/catalyst/expressions/grouping.scala  |  9 +++++++--
 sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql |  3 +++
 .../src/test/resources/sql-tests/results/grouping_set.sql.out | 11 +++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
index 8ce0e57b691..22e25b31f2e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
@@ -151,10 +151,15 @@ case class GroupingSets(
   override def selectedGroupByExprs: Seq[Seq[Expression]] = groupingSets
   // Includes the `userGivenGroupByExprs` in the children, which will be included in the final
   // GROUP BY expressions, so that `SELECT c ... GROUP BY (a, b, c) GROUPING SETS (a, b)` works.
-  override def children: Seq[Expression] = flatGroupingSets ++ userGivenGroupByExprs
+  // Note that, we must put `userGivenGroupByExprs` at the beginning, to preserve the order of
+  // grouping columns specified by users. For example, GROUP BY (a, b) GROUPING SETS (b, a), the
+  // final grouping columns should be (a, b).
+  override def children: Seq[Expression] = userGivenGroupByExprs ++ flatGroupingSets
   override protected def withNewChildrenInternal(
       newChildren: IndexedSeq[Expression]): GroupingSets =
-    super.legacyWithNewChildren(newChildren).asInstanceOf[GroupingSets]
+    copy(
+      userGivenGroupByExprs = newChildren.take(userGivenGroupByExprs.length),
+      flatGroupingSets = newChildren.drop(userGivenGroupByExprs.length))
 }
 
 object GroupingSets {
diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
index d30914fdd92..4d516bdda7b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql
@@ -57,3 +57,6 @@ SELECT k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUP
 SELECT grouping__id, k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1));
 
 SELECT grouping(k1), k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1));
+
+-- grouping_id function
+SELECT grouping_id(k1, k2), avg(v) from (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY k1, k2 GROUPING SETS ((k2, k1), k1);
diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
index 6af8e7048c8..2d53ba02c21 100644
--- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out
@@ -205,3 +205,14 @@ struct<grouping(k1):tinyint,k1:int,k2:int,avg(v):double>
 0	2	2	2.0
 0	2	2	2.0
 0	2	NULL	2.0
+
+
+-- !query
+SELECT grouping_id(k1, k2), avg(v) from (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY k1, k2 GROUPING SETS ((k2, k1), k1)
+-- !query schema
+struct<grouping_id(k1, k2):bigint,avg(v):double>
+-- !query output
+0	1.0
+0	2.0
+1	1.0
+1	2.0


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org