You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/09/05 09:35:04 UTC

[spark] branch master updated: [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new be04c97  [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base
be04c97 is described below

commit be04c972623df0c44d92ba55a9efadf59a27089e
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Thu Sep 5 18:34:44 2019 +0900

    [SPARK-28971][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part4.sql' into UDF test base
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to port `pgSQL/aggregates_part4.sql` into UDF test base.
    
    <details><summary>Diff comparing to 'pgSQL/aggregates_part3.sql'</summary>
    <p>
    
    ```diff
    ```
    
    </p>
    </details>
    
    ### Why are the changes needed?
    
    To improve test coverage in UDFs.
    
    ### Does this PR introduce any user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Manually tested via:
    
    ```bash
     build/sbt "sql/test-only *SQLQueryTestSuite -- -z udf/pgSQL/udf-aggregates_part4.sql"
    ```
    
    as guided in https://issues.apache.org/jira/browse/SPARK-27921
    
    Closes #25677 from HyukjinKwon/SPARK-28971.
    
    Authored-by: HyukjinKwon <gu...@apache.org>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 .../inputs/udf/pgSQL/udf-aggregates_part4.sql      | 421 +++++++++++++++++++++
 .../results/udf/pgSQL/udf-aggregates_part4.sql.out |   5 +
 2 files changed, 426 insertions(+)

diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql
new file mode 100644
index 0000000..7c77773
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql
@@ -0,0 +1,421 @@
+--
+-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+--
+--
+-- AGGREGATES [Part 4]
+-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L607-L997
+
+-- This test file was converted from pgSQL/aggregates_part4.sql.
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- ordered-set aggregates
+
+-- select p, percentile_cont(p) within group (order by x::float8)
+-- from generate_series(1,5) x,
+--      (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p)
+-- group by p order by p;
+
+-- select p, percentile_cont(p order by p) within group (order by x)  -- error
+-- from generate_series(1,5) x,
+--      (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p)
+-- group by p order by p;
+
+-- select p, sum() within group (order by x::float8)  -- error
+-- from generate_series(1,5) x,
+--      (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p)
+-- group by p order by p;
+
+-- select p, percentile_cont(p,p)  -- error
+-- from generate_series(1,5) x,
+--      (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1)) v(p)
+-- group by p order by p;
+
+-- select percentile_cont(0.5) within group (order by b) from aggtest;
+-- select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
+-- select percentile_cont(0.5) within group (order by thousand) from tenk1;
+-- select percentile_disc(0.5) within group (order by thousand) from tenk1;
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- select rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+-- select cume_dist(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+-- select percent_rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4),(5)) v(x);
+-- select dense_rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- select percentile_disc(array[0,0.1,0.25,0.5,0.75,0.9,1]) within group (order by thousand)
+-- from tenk1;
+-- select percentile_cont(array[0,0.25,0.5,0.75,1]) within group (order by thousand)
+-- from tenk1;
+-- select percentile_disc(array[[null,1,0.5],[0.75,0.25,null]]) within group (order by thousand)
+-- from tenk1;
+-- select percentile_cont(array[0,1,0.25,0.75,0.5,1,0.3,0.32,0.35,0.38,0.4]) within group (order by x)
+-- from generate_series(1,6) x;
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- [SPARK-28382] Array Functions: unnest
+-- select ten, mode() within group (order by string4) from tenk1 group by ten;
+
+-- select percentile_disc(array[0.25,0.5,0.75]) within group (order by x)
+-- from unnest('{fred,jim,fred,jack,jill,fred,jill,jim,jim,sheila,jim,sheila}'::text[]) u(x);
+
+-- [SPARK-28669] System Information Functions
+-- check collation propagates up in suitable cases:
+-- select pg_collation_for(percentile_disc(1) within group (order by x collate "POSIX"))
+--   from (values ('fred'),('jim')) v(x);
+
+-- test_rank and test_percentile_disc function created by create_aggregate.sql
+-- ordered-set aggs created with CREATE AGGREGATE
+-- select test_rank(3) within group (order by x)
+-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
+-- select test_percentile_disc(0.5) within group (order by thousand) from tenk1;
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- ordered-set aggs can't use ungrouped vars in direct args:
+-- select rank(x) within group (order by x) from generate_series(1,5) x;
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- outer-level agg can't use a grouped arg of a lower level, either:
+-- select array(select percentile_disc(a) within group (order by x)
+--                from (values (0.3),(0.7)) v(a) group by a)
+--   from generate_series(1,5) g(x);
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- agg in the direct args is a grouping violation, too:
+--select rank(sum(x)) within group (order by x) from generate_series(1,5) x;
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- hypothetical-set type unification and argument-count failures:
+-- select rank(3) within group (order by x) from (values ('fred'),('jim')) v(x);
+-- select rank(3) within group (order by stringu1,stringu2) from tenk1;
+-- select rank('fred') within group (order by x) from generate_series(1,5) x;
+-- select rank('adam'::text collate "C") within group (order by x collate "POSIX")
+--   from (values ('fred'),('jim')) v(x);
+-- hypothetical-set type unification successes:
+-- select rank('adam'::varchar) within group (order by x) from (values ('fred'),('jim')) v(x);
+-- select rank('3') within group (order by x) from generate_series(1,5) x;
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- divide by zero check
+-- select percent_rank(0) within group (order by x) from generate_series(1,0) x;
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- deparse and multiple features:
+-- create view aggordview1 as
+-- select ten,
+--        percentile_disc(0.5) within group (order by thousand) as p50,
+--        percentile_disc(0.5) within group (order by thousand) filter (where hundred=1) as px,
+--        rank(5,'AZZZZ',50) within group (order by hundred, string4 desc, hundred)
+--   from tenk1
+--  group by ten order by ten;
+
+-- select pg_get_viewdef('aggordview1');
+-- select * from aggordview1 order by ten;
+-- drop view aggordview1;
+
+-- least_agg created by create_aggregate.sql
+-- variadic aggregates
+-- select least_agg(q1,q2) from int8_tbl;
+-- select least_agg(variadic array[q1,q2]) from int8_tbl;
+
+
+-- Skip these tests because we do not support create type
+-- test aggregates with common transition functions share the same states
+-- begin work;
+
+-- create type avg_state as (total bigint, count bigint);
+
+-- create or replace function avg_transfn(state avg_state, n int) returns avg_state as
+-- $$
+-- declare new_state avg_state;
+-- begin
+-- 	raise notice 'avg_transfn called with %', n;
+-- 	if state is null then
+-- 		if n is not null then
+-- 			new_state.total := n;
+-- 			new_state.count := 1;
+-- 			return new_state;
+-- 		end if;
+-- 		return null;
+-- 	elsif n is not null then
+-- 		state.total := state.total + n;
+-- 		state.count := state.count + 1;
+-- 		return state;
+-- 	end if;
+--
+-- 	return null;
+-- end
+-- $$ language plpgsql;
+
+-- create function avg_finalfn(state avg_state) returns int4 as
+-- $$
+-- begin
+-- 	if state is null then
+-- 		return NULL;
+-- 	else
+-- 		return state.total / state.count;
+-- 	end if;
+-- end
+-- $$ language plpgsql;
+
+-- create function sum_finalfn(state avg_state) returns int4 as
+-- $$
+-- begin
+-- 	if state is null then
+-- 		return NULL;
+-- 	else
+-- 		return state.total;
+-- 	end if;
+-- end
+-- $$ language plpgsql;
+
+-- create aggregate my_avg(int4)
+-- (
+--    stype = avg_state,
+--    sfunc = avg_transfn,
+--    finalfunc = avg_finalfn
+-- );
+--
+-- create aggregate my_sum(int4)
+-- (
+--    stype = avg_state,
+--    sfunc = avg_transfn,
+--    finalfunc = sum_finalfn
+-- );
+
+-- aggregate state should be shared as aggs are the same.
+-- select my_avg(one),my_avg(one) from (values(1),(3)) t(one);
+
+-- aggregate state should be shared as transfn is the same for both aggs.
+-- select my_avg(one),my_sum(one) from (values(1),(3)) t(one);
+
+-- same as previous one, but with DISTINCT, which requires sorting the input.
+-- select my_avg(distinct one),my_sum(distinct one) from (values(1),(3),(1)) t(one);
+
+-- shouldn't share states due to the distinctness not matching.
+-- select my_avg(distinct one),my_sum(one) from (values(1),(3)) t(one);
+
+-- shouldn't share states due to the filter clause not matching.
+-- select my_avg(one) filter (where one > 1),my_sum(one) from (values(1),(3)) t(one);
+
+-- this should not share the state due to different input columns.
+-- select my_avg(one),my_sum(two) from (values(1,2),(3,4)) t(one,two);
+
+-- [SPARK-27980] Ordered-Set Aggregate Functions
+-- exercise cases where OSAs share state
+-- select
+--   percentile_cont(0.5) within group (order by a),
+--   percentile_disc(0.5) within group (order by a)
+-- from (values(1::float8),(3),(5),(7)) t(a);
+
+-- select
+--   percentile_cont(0.25) within group (order by a),
+--   percentile_disc(0.5) within group (order by a)
+-- from (values(1::float8),(3),(5),(7)) t(a);
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- these can't share state currently
+-- select
+--   rank(4) within group (order by a),
+--   dense_rank(4) within group (order by a)
+-- from (values(1),(3),(5),(7)) t(a);
+
+-- test that aggs with the same sfunc and initcond share the same agg state
+-- create aggregate my_sum_init(int4)
+-- (
+--    stype = avg_state,
+--    sfunc = avg_transfn,
+--    finalfunc = sum_finalfn,
+--    initcond = '(10,0)'
+-- );
+
+-- create aggregate my_avg_init(int4)
+-- (
+--    stype = avg_state,
+--    sfunc = avg_transfn,
+--    finalfunc = avg_finalfn,
+--    initcond = '(10,0)'
+-- );
+
+-- create aggregate my_avg_init2(int4)
+-- (
+--    stype = avg_state,
+--    sfunc = avg_transfn,
+--    finalfunc = avg_finalfn,
+--    initcond = '(4,0)'
+-- );
+
+-- state should be shared if INITCONDs are matching
+-- select my_sum_init(one),my_avg_init(one) from (values(1),(3)) t(one);
+
+-- Varying INITCONDs should cause the states not to be shared.
+-- select my_sum_init(one),my_avg_init2(one) from (values(1),(3)) t(one);
+
+-- rollback;
+
+-- test aggregate state sharing to ensure it works if one aggregate has a
+-- finalfn and the other one has none.
+-- begin work;
+
+-- create or replace function sum_transfn(state int4, n int4) returns int4 as
+-- $$
+-- declare new_state int4;
+-- begin
+-- 	raise notice 'sum_transfn called with %', n;
+-- 	if state is null then
+-- 		if n is not null then
+-- 			new_state := n;
+-- 			return new_state;
+-- 		end if;
+-- 		return null;
+-- 	elsif n is not null then
+-- 		state := state + n;
+-- 		return state;
+-- 	end if;
+--
+-- 	return null;
+-- end
+-- $$ language plpgsql;
+
+-- create function halfsum_finalfn(state int4) returns int4 as
+-- $$
+-- begin
+-- 	if state is null then
+-- 		return NULL;
+-- 	else
+-- 		return state / 2;
+-- 	end if;
+-- end
+-- $$ language plpgsql;
+
+-- create aggregate my_sum(int4)
+-- (
+--    stype = int4,
+--    sfunc = sum_transfn
+-- );
+
+-- create aggregate my_half_sum(int4)
+-- (
+--    stype = int4,
+--    sfunc = sum_transfn,
+--    finalfunc = halfsum_finalfn
+-- );
+
+-- Agg state should be shared even though my_sum has no finalfn
+-- select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one);
+
+-- rollback;
+
+
+-- test that the aggregate transition logic correctly handles
+-- transition / combine functions returning NULL
+
+-- First test the case of a normal transition function returning NULL
+-- BEGIN;
+-- CREATE FUNCTION balkifnull(int8, int4)
+-- RETURNS int8
+-- STRICT
+-- LANGUAGE plpgsql AS $$
+-- BEGIN
+--     IF $1 IS NULL THEN
+--        RAISE 'erroneously called with NULL argument';
+--     END IF;
+--     RETURN NULL;
+-- END$$;
+
+-- CREATE AGGREGATE balk(int4)
+-- (
+--     SFUNC = balkifnull(int8, int4),
+--     STYPE = int8,
+--     PARALLEL = SAFE,
+--     INITCOND = '0'
+-- );
+
+-- SELECT balk(hundred) FROM tenk1;
+
+-- ROLLBACK;
+
+-- Secondly test the case of a parallel aggregate combiner function
+-- returning NULL. For that use normal transition function, but a
+-- combiner function returning NULL.
+-- BEGIN ISOLATION LEVEL REPEATABLE READ;
+-- CREATE FUNCTION balkifnull(int8, int8)
+-- RETURNS int8
+-- PARALLEL SAFE
+-- STRICT
+-- LANGUAGE plpgsql AS $$
+-- BEGIN
+--     IF $1 IS NULL THEN
+--        RAISE 'erroneously called with NULL argument';
+--     END IF;
+--     RETURN NULL;
+-- END$$;
+
+-- CREATE AGGREGATE balk(int4)
+-- (
+--     SFUNC = int4_sum(int8, int4),
+--     STYPE = int8,
+--     COMBINEFUNC = balkifnull(int8, int8),
+--     PARALLEL = SAFE,
+--     INITCOND = '0'
+-- );
+
+-- force use of parallelism
+-- ALTER TABLE tenk1 set (parallel_workers = 4);
+-- SET LOCAL parallel_setup_cost=0;
+-- SET LOCAL max_parallel_workers_per_gather=4;
+
+-- EXPLAIN (COSTS OFF) SELECT balk(hundred) FROM tenk1;
+-- SELECT balk(hundred) FROM tenk1;
+
+-- ROLLBACK;
+
+-- test coverage for aggregate combine/serial/deserial functions
+-- BEGIN ISOLATION LEVEL REPEATABLE READ;
+
+-- SET parallel_setup_cost = 0;
+-- SET parallel_tuple_cost = 0;
+-- SET min_parallel_table_scan_size = 0;
+-- SET max_parallel_workers_per_gather = 4;
+-- SET enable_indexonlyscan = off;
+
+-- [SPARK-28663] Aggregate Functions for Statistics
+-- variance(int4) covers numeric_poly_combine
+-- sum(int8) covers int8_avg_combine
+-- regr_count(float8, float8) covers int8inc_float8_float8 and aggregates with > 1 arg
+-- EXPLAIN (COSTS OFF, VERBOSE)
+--   SELECT variance(unique1::int4), sum(unique1::int8), regr_count(unique1::float8, unique1::float8) FROM tenk1;
+
+-- SELECT variance(unique1::int4), sum(unique1::int8), regr_count(unique1::float8, unique1::float8) FROM tenk1;
+
+-- ROLLBACK;
+
+-- [SPARK-28661] Hypothetical-Set Aggregate Functions
+-- test coverage for dense_rank
+-- SELECT dense_rank(x) WITHIN GROUP (ORDER BY x) FROM (VALUES (1),(1),(2),(2),(3),(3)) v(x) GROUP BY (x) ORDER BY 1;
+
+
+-- [SPARK-28664] ORDER BY in aggregate function
+-- Ensure that the STRICT checks for aggregates does not take NULLness
+-- of ORDER BY columns into account. See bug report around
+-- 2a505161-2727-2473-7c46-591ed108ac52@email.cz
+-- SELECT min(x ORDER BY y) FROM (VALUES(1, NULL)) AS d(x,y);
+-- SELECT min(x ORDER BY y) FROM (VALUES(1, 2)) AS d(x,y);
+
+-- [SPARK-28382] Array Functions: unnest
+-- check collation-sensitive matching between grouping expressions
+-- select v||'a', case v||'a' when 'aa' then 1 else 0 end, count(*)
+--   from unnest(array['a','b']) u(v)
+--  group by v||'a' order by 1;
+-- select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
+--   from unnest(array['a','b']) u(v)
+--  group by v||'a' order by 1;
+
+-- Make sure that generation of HashAggregate for uniqification purposes
+-- does not lead to array overflow due to unexpected duplicate hash keys
+-- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
+-- explain (costs off)
+--   select 1 from tenk1
+--    where (hundred, thousand) in (select twothousand, twothousand from onek);
diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part4.sql.out
new file mode 100644
index 0000000..b7bbdc5
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part4.sql.out
@@ -0,0 +1,5 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 0
+
+
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org