You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/07/18 23:15:49 UTC

[GitHub] [spark] skonto opened a new pull request #25196: [SPARK-28279][SQL][PYTHON][TESTS] Convert and port 'group-analytics.sql' into UDF test base

skonto opened a new pull request #25196: [SPARK-28279][SQL][PYTHON][TESTS] Convert and port 'group-analytics.sql' into UDF test base
URL: https://github.com/apache/spark/pull/25196
 
 
   ## What changes were proposed in this pull request?
   This PR adds some tests converted from group-analytics.sql to test UDFs. Please see contribution guide of this umbrella ticket - SPARK-27921.
   <details><summary>Diff comparing to 'group-analytics.sql'</summary>
   <p>
   
   ```diff
   diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   index 3439a05727..ef14152136 100644
   --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   @@ -13,9 +13,9 @@ struct<>
    
    
    -- !query 1
   -SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE
   +SELECT a + b, b, udf(SUM(a - b)) FROM testData GROUP BY a + b, b WITH CUBE
    -- !query 1 schema
   -struct<(a + b):int,b:int,sum((a - b)):bigint>
   +struct<(a + b):int,b:int,CAST(udf(cast(sum(cast((a - b) as bigint)) as string)) AS BIGINT):bigint>
    -- !query 1 output
    2	1	0
    2	NULL	0
   @@ -33,9 +33,9 @@ NULL	NULL	3
    
    
    -- !query 2
   -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE
   +SELECT a, udf(b), SUM(b) FROM testData GROUP BY a, b WITH CUBE
    -- !query 2 schema
   -struct<a:int,b:int,sum(b):bigint>
   +struct<a:int,CAST(udf(cast(b as string)) AS INT):int,sum(b):bigint>
    -- !query 2 output
    1	1	1
    1	2	2
   @@ -52,9 +52,9 @@ NULL	NULL	9
    
    
    -- !query 3
   -SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
   +SELECT udf(a + b), b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
    -- !query 3 schema
   -struct<(a + b):int,b:int,sum((a - b)):bigint>
   +struct<CAST(udf(cast((a + b) as string)) AS INT):int,b:int,sum((a - b)):bigint>
    -- !query 3 output
    2	1	0
    2	NULL	0
   @@ -70,9 +70,9 @@ NULL	NULL	3
    
    
    -- !query 4
   -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP
   +SELECT a, b, udf(SUM(b)) FROM testData GROUP BY a, b WITH ROLLUP
    -- !query 4 schema
   -struct<a:int,b:int,sum(b):bigint>
   +struct<a:int,b:int,CAST(udf(cast(sum(cast(b as bigint)) as string)) AS BIGINT):bigint>
    -- !query 4 output
    1	1	1
    1	2	2
   @@ -97,7 +97,7 @@ struct<>
    
    
    -- !query 6
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year
   +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY udf(course), year
    -- !query 6 schema
    struct<course:string,year:int,sum(earnings):bigint>
    -- !query 6 output
   @@ -111,7 +111,7 @@ dotNET	2013	48000
    
    
    -- !query 7
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year
   +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, udf(year)
    -- !query 7 schema
    struct<course:string,year:int,sum(earnings):bigint>
    -- !query 7 output
   @@ -127,9 +127,9 @@ dotNET	2013	48000
    
    
    -- !query 8
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year)
   +SELECT course, udf(year), SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year)
    -- !query 8 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<course:string,CAST(udf(cast(year as string)) AS INT):int,sum(earnings):bigint>
    -- !query 8 output
    Java	NULL	50000
    NULL	2012	35000
   @@ -138,26 +138,26 @@ dotNET	NULL	63000
    
    
    -- !query 9
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course)
   +SELECT course, year, udf(SUM(earnings)) FROM courseSales GROUP BY course, year GROUPING SETS(course)
    -- !query 9 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<course:string,year:int,CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS BIGINT):bigint>
    -- !query 9 output
    Java	NULL	50000
    dotNET	NULL	63000
    
    
    -- !query 10
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year)
   +SELECT udf(course), year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year)
    -- !query 10 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<CAST(udf(cast(course as string)) AS STRING):string,year:int,sum(earnings):bigint>
    -- !query 10 output
    NULL	2012	35000
    NULL	2013	78000
    
    
    -- !query 11
   -SELECT course, SUM(earnings) AS sum FROM courseSales
   -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum
   +SELECT course, udf(SUM(earnings)) AS sum FROM courseSales
   +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, udf(sum)
    -- !query 11 schema
    struct<course:string,sum:bigint>
    -- !query 11 output
   @@ -173,7 +173,7 @@ dotNET	63000
    
    -- !query 12
    SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales
   -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum
   +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY udf(course), sum
    -- !query 12 schema
    struct<course:string,sum:bigint,grouping_id(course, earnings):int>
    -- !query 12 output
   @@ -188,10 +188,10 @@ dotNET	63000	1
    
    
    -- !query 13
   -SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales
   +SELECT udf(course), udf(year), GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales
    GROUP BY CUBE(course, year)
    -- !query 13 schema
   -struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course, year):int>
   +struct<CAST(udf(cast(course as string)) AS STRING):string,CAST(udf(cast(year as string)) AS INT):int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course, year):int>
    -- !query 13 output
    Java	2012	0	0	0
    Java	2013	0	0	0
   @@ -205,7 +205,7 @@ dotNET	NULL	0	1	1
    
    
    -- !query 14
   -SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year
   +SELECT course, udf(year), GROUPING(course) FROM courseSales GROUP BY course, year
    -- !query 14 schema
    struct<>
    -- !query 14 output
   @@ -214,7 +214,7 @@ grouping() can only be used with GroupingSets/Cube/Rollup;
    
    
    -- !query 15
   -SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year
   +SELECT course, udf(year), GROUPING_ID(course, year) FROM courseSales GROUP BY course, year
    -- !query 15 schema
    struct<>
    -- !query 15 output
   @@ -240,7 +240,7 @@ NULL	NULL	3
    
    -- !query 17
    SELECT course, year FROM courseSales GROUP BY CUBE(course, year)
   -HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year
   +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, udf(year)
    -- !query 17 schema
    struct<course:string,year:int>
    -- !query 17 output
   @@ -250,7 +250,7 @@ dotNET	NULL
    
    
    -- !query 18
   -SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0
   +SELECT course, udf(year) FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0
    -- !query 18 schema
    struct<>
    -- !query 18 output
   @@ -259,7 +259,7 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
    
    
    -- !query 19
   -SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0
   +SELECT course, udf(udf(year)) FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0
    -- !query 19 schema
    struct<>
    -- !query 19 output
   @@ -268,9 +268,9 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
    
    
    -- !query 20
   -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0
   +SELECT udf(course), year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0
    -- !query 20 schema
   -struct<course:string,year:int>
   +struct<CAST(udf(cast(course as string)) AS STRING):string,year:int>
    -- !query 20 output
    Java	NULL
    NULL	2012
   @@ -281,7 +281,7 @@ dotNET	NULL
    
    -- !query 21
    SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year)
   -ORDER BY GROUPING(course), GROUPING(year), course, year
   +ORDER BY GROUPING(course), GROUPING(year), course, udf(year)
    -- !query 21 schema
    struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint>
    -- !query 21 output
   @@ -298,7 +298,7 @@ NULL	NULL	1	1
    
    -- !query 22
    SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year)
   -ORDER BY GROUPING(course), GROUPING(year), course, year
   +ORDER BY GROUPING(course), GROUPING(year), course, udf(year)
    -- !query 22 schema
    struct<course:string,year:int,grouping_id(course, year):int>
    -- !query 22 output
   @@ -314,7 +314,7 @@ NULL	NULL	3
    
    
    -- !query 23
   -SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course)
   +SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING(course)
    -- !query 23 schema
    struct<>
    -- !query 23 output
   @@ -323,7 +323,7 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
    
    
    -- !query 24
   -SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course)
   +SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING_ID(course)
    -- !query 24 schema
    struct<>
    -- !query 24 output
   @@ -332,7 +332,7 @@ grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup;
    
    
    -- !query 25
   -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year
   +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, udf(course), year
    -- !query 25 schema
    struct<course:string,year:int>
    -- !query 25 output
   @@ -348,7 +348,7 @@ NULL	NULL
    
    
    -- !query 26
   -SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
   +SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
    -- !query 26 schema
    struct<k1:int,k2:int,sum((a - b)):bigint>
    -- !query 26 output
   @@ -368,7 +368,7 @@ NULL	NULL	3
    
    
    -- !query 27
   -SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
   +SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
    -- !query 27 schema
    struct<k:int,b:int,sum((a - b)):bigint>
    -- !query 27 output
   @@ -386,9 +386,9 @@ NULL	NULL	3
    
    
    -- !query 28
   -SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
   +SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
    -- !query 28 schema
   -struct<(a + b):int,k:int,sum((a - b)):bigint>
   +struct<CAST(udf(cast((a + b) as string)) AS INT):int,k:int,sum((a - b)):bigint>
    -- !query 28 output
    NULL	1	3
    NULL	2	0
   
   ```
   ## How was this patch tested?
   
   Tested as guided in SPARK-27921.
   Verified pandas & pyarrow versions:
   
   Python 3.6.8 (default, Jan 14 2019, 11:02:34) 
   [GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux
   Type "help", "copyright", "credits" or "license" for more information.
   >>> import pandas
   >>> import pyarrow
   >>> pyarrow.__version__
   '0.14.0'
   >>> pandas.__version__
   '0.24.2'
   From the sql output it seems that sql statements are evaluated correctly given that udf returns a string and may change results as Null will be returned as None and will be counted in returned values.
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org