You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2023/02/01 09:36:35 UTC

[spark] branch master updated: [SPARK-42259][SQL] ResolveGroupingAnalytics should take care of Python UDAF

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1219c849237 [SPARK-42259][SQL] ResolveGroupingAnalytics should take care of Python UDAF
1219c849237 is described below

commit 1219c8492376e038894111cd5d922229260482e7
Author: Wenchen Fan <we...@databricks.com>
AuthorDate: Wed Feb 1 17:36:14 2023 +0800

    [SPARK-42259][SQL] ResolveGroupingAnalytics should take care of Python UDAF
    
    ### What changes were proposed in this pull request?
    
    This is a long-standing correctness issue with Python UDAF and grouping analytics. The rule `ResolveGroupingAnalytics` should take care of Python UDAF when matching aggregate expressions.
    
    ### Why are the changes needed?
    
    Bug fix: the `ResolveGroupingAnalytics` rule previously only matched `AggregateExpression`, so Python UDAFs used with CUBE/ROLLUP/GROUPING SETS produced incorrect results.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Queries combining a Python UDAF with grouping analytics (CUBE/ROLLUP/GROUPING SETS) returned wrong results before this fix; they now return correct results.
    
    ### How was this patch tested?
    
    Existing tests; the regenerated golden results in `udaf-group-analytics.sql.out` now reflect the corrected aggregate values.
    
    Closes #39824 from cloud-fan/python.
    
    Authored-by: Wenchen Fan <we...@databricks.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/analysis/Analyzer.scala     |  2 +-
 .../results/udaf/udaf-group-analytics.sql.out      | 58 +++++++++++-----------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 8f6028b0993..ce273f01c7a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -622,7 +622,7 @@ class Analyzer(override val catalogManager: CatalogManager)
         // AggregateExpression should be computed on the unmodified value of its argument
         // expressions, so we should not replace any references to grouping expression
         // inside it.
-        case e: AggregateExpression =>
+        case e if AggregateExpression.isAggregate(e) =>
           aggsBuffer += e
           e
         case e if isPartOfAggregation(e) => e
diff --git a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
index b8c94b19d81..f0be6f43642 100644
--- a/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/udaf/udaf-group-analytics.sql.out
@@ -15,18 +15,18 @@ SELECT a + b, b, udaf(a - b) FROM testData GROUP BY a + b, b WITH CUBE
 struct<(a + b):int,b:int,udaf((a - b)):int>
 -- !query output
 2	1	1
-2	NULL	0
+2	NULL	1
 3	1	1
 3	2	1
-3	NULL	0
+3	NULL	2
 4	1	1
 4	2	1
-4	NULL	0
+4	NULL	2
 5	2	1
-5	NULL	0
+5	NULL	1
 NULL	1	3
 NULL	2	3
-NULL	NULL	0
+NULL	NULL	6
 
 
 -- !query
@@ -36,16 +36,16 @@ struct<a:int,b:int,udaf(b):int>
 -- !query output
 1	1	1
 1	2	1
-1	NULL	0
+1	NULL	2
 2	1	1
 2	2	1
-2	NULL	0
+2	NULL	2
 3	1	1
 3	2	1
-3	NULL	0
+3	NULL	2
 NULL	1	3
 NULL	2	3
-NULL	NULL	0
+NULL	NULL	6
 
 
 -- !query
@@ -54,16 +54,16 @@ SELECT a + b, b, udaf(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
 struct<(a + b):int,b:int,udaf((a - b)):int>
 -- !query output
 2	1	1
-2	NULL	0
+2	NULL	1
 3	1	1
 3	2	1
-3	NULL	0
+3	NULL	2
 4	1	1
 4	2	1
-4	NULL	0
+4	NULL	2
 5	2	1
-5	NULL	0
-NULL	NULL	0
+5	NULL	1
+NULL	NULL	6
 
 
 -- !query
@@ -73,14 +73,14 @@ struct<a:int,b:int,udaf(b):int>
 -- !query output
 1	1	1
 1	2	1
-1	NULL	0
+1	NULL	2
 2	1	1
 2	2	1
-2	NULL	0
+2	NULL	2
 3	1	1
 3	2	1
-3	NULL	0
-NULL	NULL	0
+3	NULL	2
+NULL	NULL	6
 
 
 -- !query
@@ -416,14 +416,14 @@ GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER
 -- !query schema
 struct<course:string,sum:int>
 -- !query output
-NULL	0
-Java	0
+NULL	5
 Java	1
 Java	1
-dotNET	0
+Java	2
 dotNET	1
 dotNET	1
 dotNET	1
+dotNET	3
 
 
 -- !query
@@ -432,14 +432,14 @@ GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER
 -- !query schema
 struct<course:string,sum:int,grouping_id(course, earnings):bigint>
 -- !query output
-NULL	0	3
-Java	0	1
+NULL	5	3
 Java	1	0
 Java	1	0
-dotNET	0	1
+Java	2	1
 dotNET	1	0
 dotNET	1	0
 dotNET	1	0
+dotNET	3	1
 
 
 -- !query
@@ -468,16 +468,16 @@ SELECT a + b AS k, b, udaf(a - b) FROM testData GROUP BY ROLLUP(k, b)
 struct<k:int,b:int,udaf((a - b)):int>
 -- !query output
 2	1	1
-2	NULL	0
+2	NULL	1
 3	1	1
 3	2	1
-3	NULL	0
+3	NULL	2
 4	1	1
 4	2	1
-4	NULL	0
+4	NULL	2
 5	2	1
-5	NULL	0
-NULL	NULL	0
+5	NULL	1
+NULL	NULL	6
 
 
 -- !query


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org