You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/08/03 06:31:52 UTC
[GitHub] [spark] viirya commented on a change in pull request #25340: [SPARK-28393][SQL][PYTHON][TESTS] Convert and port 'pgSQL/join.sql' into UDF test base

viirya commented on a change in pull request #25340: [SPARK-28393][SQL][PYTHON][TESTS] Convert and port 'pgSQL/join.sql' into UDF test base
URL: https://github.com/apache/spark/pull/25340#discussion_r310341346
 
 

 ##########
 File path: sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-join.sql
 ##########
 @@ -0,0 +1,2187 @@
+--
+-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+--
+--
+-- JOIN
+-- Test JOIN clauses
+-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/join.sql
+--
+-- This test file was converted from pgSQL/join.sql.
+
+CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM
+  (VALUES (0), (123456), (-123456), (2147483647), (-2147483647))
+  AS v(f1);
+CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM
+  (VALUES
+    (123, 456),
+    (123, 4567890123456789),
+    (4567890123456789, 123),
+    (4567890123456789, 4567890123456789),
+    (4567890123456789, -4567890123456789))
+  AS v(q1, q2);
+CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM
+  (VALUES (0.0), (1004.30), (-34.84),
+    (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double)))
+  AS v(f1);
+CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM
+  (VALUES ('doh!'), ('hi de ho neighbor'))
+  AS v(f1);
+CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1;
+
+CREATE TABLE J1_TBL (
+  i integer,
+  j integer,
+  t string
+) USING parquet;
+
+CREATE TABLE J2_TBL (
+  i integer,
+  k integer
+) USING parquet;
+
+
+INSERT INTO J1_TBL VALUES (1, 4, 'one');
+INSERT INTO J1_TBL VALUES (2, 3, 'two');
+INSERT INTO J1_TBL VALUES (3, 2, 'three');
+INSERT INTO J1_TBL VALUES (4, 1, 'four');
+INSERT INTO J1_TBL VALUES (5, 0, 'five');
+INSERT INTO J1_TBL VALUES (6, 6, 'six');
+INSERT INTO J1_TBL VALUES (7, 7, 'seven');
+INSERT INTO J1_TBL VALUES (8, 8, 'eight');
+INSERT INTO J1_TBL VALUES (0, NULL, 'zero');
+INSERT INTO J1_TBL VALUES (NULL, NULL, 'null');
+INSERT INTO J1_TBL VALUES (NULL, 0, 'zero');
+
+INSERT INTO J2_TBL VALUES (1, -1);
+INSERT INTO J2_TBL VALUES (2, 2);
+INSERT INTO J2_TBL VALUES (3, -3);
+INSERT INTO J2_TBL VALUES (2, 4);
+INSERT INTO J2_TBL VALUES (5, -5);
+INSERT INTO J2_TBL VALUES (5, -5);
+INSERT INTO J2_TBL VALUES (0, NULL);
+INSERT INTO J2_TBL VALUES (NULL, NULL);
+INSERT INTO J2_TBL VALUES (NULL, 0);
+
+-- [SPARK-20856] onerow is not needed because it is only used by test statements with nested joins
+-- useful in some tests below
+-- create temp table onerow();
+-- insert into onerow default values;
+-- analyze onerow;
+
+
+--
+-- CORRELATION NAMES
+-- Make sure that table/column aliases are supported
+-- before diving into more complex join syntax.
+--
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t)
+  FROM J1_TBL AS tx;
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t)
+  FROM J1_TBL tx;
+
+SELECT udf('') AS `xxx`, udf(a), udf(b), udf(c)
+  FROM J1_TBL AS t1 (a, b, c);
+
+SELECT udf('') AS `xxx`, udf(a), udf(b), udf(c)
+  FROM J1_TBL t1 (a, b, c);
+
+SELECT udf('') AS `xxx`, udf(a), udf(b), udf(c), udf(d), udf(e)
+  FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e);
+
+-- [SPARK-28377] Fully support correlation names in the FROM clause
+-- SELECT '' AS "xxx", t1.a, t2.e
+--   FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e)
+--   WHERE t1.a = t2.d;
+
+
+--
+-- CROSS JOIN
+-- Qualifications are not allowed on cross joins,
+-- which degenerate into a standard unqualified inner join.
+--
+
+SELECT udf('') AS `xxx`, *
+  FROM J1_TBL CROSS JOIN J2_TBL;
+
+-- ambiguous column
+SELECT udf('') AS `xxx`, udf(i), udf(k), udf(t)
+  FROM J1_TBL CROSS JOIN J2_TBL;
+
+-- resolve previous ambiguity by specifying the table name
+SELECT udf('') AS `xxx`, udf(t1.i), udf(k), udf(t)
+  FROM J1_TBL t1 CROSS JOIN J2_TBL t2;
+
+SELECT udf('') AS `xxx`, udf(ii), udf(tt), udf(kk)
+  FROM (J1_TBL CROSS JOIN J2_TBL)
+    AS tx (ii, jj, tt, ii2, kk);
+
+-- [SPARK-28377] Fully support correlation names in the FROM clause
+-- SELECT '' AS `xxx`, tx.ii, tx.jj, tx.kk
+--   FROM (J1_TBL t1 (a, b, c) CROSS JOIN J2_TBL t2 (d, e))
+--     AS tx (ii, jj, tt, ii2, kk);
+
+SELECT udf('') AS `xxx`, udf(j1_tbl.i), udf(j), udf(t), udf(a.i), udf(a.k), udf(b.i),  udf(b.k)
+  FROM J1_TBL CROSS JOIN J2_TBL a CROSS JOIN J2_TBL b;
+
+
+--
+--
+-- Inner joins (equi-joins)
+--
+--
+
+--
+-- Inner joins (equi-joins) with USING clause
+-- The USING syntax changes the shape of the resulting table
+-- by including a column in the USING clause only once in the result.
+--
+
+-- Inner equi-join on specified column
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL INNER JOIN J2_TBL USING (i);
+
+-- Same as above, slightly different syntax
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL JOIN J2_TBL USING (i);
+
+SELECT udf('') AS `xxx`, *
+  FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, d) USING (a)
+  ORDER BY udf(a), udf(d);
+
+-- [SPARK-28377] Fully support correlation names in the FROM clause
+-- SELECT '' AS `xxx`, *
+--   FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, b) USING (b)
+--   ORDER BY b, t1.a;
+
+
+--
+-- NATURAL JOIN
+-- Inner equi-join on all columns with the same name
+--
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL NATURAL JOIN J2_TBL;
+
+SELECT udf('') AS `xxx`, udf(a), udf(b), udf(c), udf(d)
+  FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (a, d);
+
+SELECT udf('') AS `xxx`, udf(a), udf(b), udf(c), udf(d)
+  FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (d, a);
+
+-- [SPARK-28377] Fully support correlation names in the FROM clause
+-- mismatch number of columns
+-- currently, Postgres will fill in with underlying names
+-- SELECT '' AS `xxx`, *
+--   FROM J1_TBL t1 (a, b) NATURAL JOIN J2_TBL t2 (a);
+
+
+--
+-- Inner joins (equi-joins)
+--
+
+SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(J1_TBL.j), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k)
+  FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.i);
+
+SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(J1_TBL.j), udf(J1_TBL.t), J2_TBL.i, J2_TBL.k
+  FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.k);
+
+
+--
+-- Non-equi-joins
+--
+
+SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(J1_TBL.j), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k)
+  FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i <= J2_TBL.k);
+
+
+--
+-- Outer joins
+-- Note that OUTER is a noise word
+--
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i)
+  ORDER BY udf(i), udf(k), udf(t);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL LEFT JOIN J2_TBL USING (i)
+  ORDER BY udf(i), udf(k), udf(t);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL RIGHT OUTER JOIN J2_TBL USING (i);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL RIGHT JOIN J2_TBL USING (i);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i)
+  ORDER BY udf(i), udf(k), udf(t);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL FULL JOIN J2_TBL USING (i)
+  ORDER BY udf(i), udf(k), udf(t);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (udf(k) = 1);
+
+SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k)
+  FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (udf(i) = 1);
+
+--
+-- semijoin selectivity for <>
+--
+-- explain (costs off)
+-- select * from int4_tbl i4, tenk1 a
+-- where exists(select * from tenk1 b
+--              where a.twothousand = b.twothousand and a.fivethous <> b.fivethous)
+--       and i4.f1 = a.tenthous;
+
+
+--
+-- More complicated constructs
+--
+
+--
+-- Multiway full join
+--
+
+CREATE TABLE t1 (name STRING, n INTEGER) USING parquet;
+CREATE TABLE t2 (name STRING, n INTEGER) USING parquet;
+CREATE TABLE t3 (name STRING, n INTEGER) USING parquet;
+
+INSERT INTO t1 VALUES ( 'bb', 11 );
+INSERT INTO t2 VALUES ( 'bb', 12 );
+INSERT INTO t2 VALUES ( 'cc', 22 );
+INSERT INTO t2 VALUES ( 'ee', 42 );
+INSERT INTO t3 VALUES ( 'bb', 13 );
+INSERT INTO t3 VALUES ( 'cc', 23 );
+INSERT INTO t3 VALUES ( 'dd', 33 );
+
+SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name);
+
+--
+-- Test interactions of join syntax and subqueries
+--
+
+-- Basic cases (we expect planner to pull up the subquery here)
+-- The udf() outputs are aliased back to their source column names: an
+-- unaliased udf(name) is exposed under a generated name such as
+-- `CAST(udf(cast(name as string)) AS STRING)`, which USING (name) cannot resolve.
+SELECT * FROM
+(SELECT udf(name) as name, udf(t2.n) as n FROM t2) as s2
+INNER JOIN
+(SELECT udf(name) as name, udf(t3.n) as n FROM t3) s3
+USING (name);
+
+--- Commented out for now: USING accepts only column names, so USING (udf(name)) does not parse.
+---+org.apache.spark.sql.catalyst.parser.ParseException
+---+mismatched input '(' expecting {')', ',', '-'}(line 5, pos 10)
+---SELECT * FROM
+---(SELECT * FROM t2) as s2
+---LEFT JOIN
+---(SELECT * FROM t3) s3
+---USING (udf(name));
+SELECT * FROM
+(SELECT * FROM t2) as s2
+LEFT JOIN
+(SELECT * FROM t3) s3
+USING (name);
+
+SELECT udf(name), udf(s2.n), udf(s3.n) FROM
+(SELECT * FROM t2) as s2
+FULL JOIN
+(SELECT * FROM t3) s3
+USING (name);
+
+-- Cases with non-nullable expressions in subquery results;
+-- make sure these go to null as expected
+SELECT * FROM
+(SELECT udf(name), udf(n) as s2_n, udf(2) as s2_2 FROM t2) as s2
+NATURAL INNER JOIN
+(SELECT udf(name), udf(n) as s3_n, udf(3) as s3_2 FROM t3) s3;
+
+SELECT * FROM
+(SELECT udf(name), udf(n) as s2_n, 2 as s2_2 FROM t2) as s2
+NATURAL LEFT JOIN
+(SELECT udf(name), udf(n) as s3_n, 3 as s3_2 FROM t3) s3;
+
+SELECT * FROM
+(SELECT udf(name), udf(n) as s2_n, 2 as s2_2 FROM t2) as s2
+NATURAL FULL JOIN
+(SELECT udf(name), udf(n) as s3_n, 3 as s3_2 FROM t3) s3;
+
+SELECT * FROM
+(SELECT udf(name), udf(n) as s1_n, 1 as s1_1 FROM t1) as s1
+NATURAL INNER JOIN
+(SELECT udf(name), udf(n) as s2_n, 2 as s2_2 FROM t2) as s2
+NATURAL INNER JOIN
+(SELECT udf(name), udf(n) as s3_n, 3 as s3_2 FROM t3) s3;
+
+SELECT * FROM
+(SELECT udf(name), udf(n) as s1_n, udf(1) as s1_1 FROM t1) as s1
+NATURAL FULL JOIN
+(SELECT udf(name), udf(n) as s2_n, udf(2) as s2_2 FROM t2) as s2
+NATURAL FULL JOIN
+(SELECT udf(name), udf(n) as s3_n, udf(3) as s3_2 FROM t3) s3;
+
+SELECT name, udf(s1_n), udf(s2_n), udf(s3_n) FROM
+(SELECT name, udf(n) as s1_n FROM t1) as s1
+NATURAL FULL JOIN
+  (SELECT * FROM
+    (SELECT name, udf(n) as s2_n FROM t2) as s2
+    NATURAL FULL JOIN
+    (SELECT name, udf(n) as s3_n FROM t3) as s3
+  ) ss2;
+
+SELECT * FROM
+(SELECT name, n as s1_n FROM t1) as s1
+NATURAL FULL JOIN
+  (SELECT * FROM
+    (SELECT name, udf(n) as s2_n, 2 as s2_2 FROM t2) as s2
+    NATURAL FULL JOIN
+    (SELECT name, udf(n) as s3_n FROM t3) as s3
+  ) ss2;
+
+-- Constants as join keys can also be problematic
+SELECT s1.name, udf(s1_n), s2.name, udf(s2_n) FROM
+  (SELECT name, udf(n) as s1_n FROM t1) as s1
+FULL JOIN
+  (SELECT name, udf(2) as s2_n FROM t2) as s2
+ON (udf(s1_n) = udf(s2_n));
+
+
+-- Test for propagation of nullability constraints into sub-joins
+
+create or replace temporary view x as select * from
+  (values (1,11), (2,22), (3,null), (4,44), (5,null))
+  as v(x1, x2);
+
+create or replace temporary view y as select * from
+  (values (1,111), (2,222), (3,333), (4,null))
+  as v(y1, y2);
+
+select udf(x1), udf(x2) from x;
+select udf(y1), udf(y2) from y;
+
+select * from x left join y on (udf(x1) = udf(y1) and udf(x2) is not null);
+select * from x left join y on (udf(x1) = udf(y1) and udf(y2) is not null);
+
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1));
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1) and udf(x2) is not null);
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1) and udf(y2) is not null);
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1) and udf(xx2) is not null);
+-- these should NOT give the same answers as above
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1)) where (udf(x2) is not null);
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1)) where (udf(y2) is not null);
+select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2)
+on (udf(x1) = udf(xx1)) where (udf(xx2) is not null);
+
+--
+-- regression test: check for bug with propagation of implied equality
+-- to outside an IN
+--
+select udf(count(*)) from tenk1 a where udf(unique1) in
+  (select udf(unique1) from tenk1 b join tenk1 c using (unique1)
+   where udf(b.unique2) = udf(42));
+
+--
+-- regression test: check for failure to generate a plan with multiple
+-- degenerate IN clauses
+--
+select udf(count(*)) from tenk1 x where
+  udf(x.unique1) in (select udf(a.f1) from int4_tbl a,float8_tbl b where udf(a.f1)=udf(b.f1)) and
+  udf(x.unique1) = 0 and
+  udf(x.unique1) in (select aa.f1 from int4_tbl aa,float8_tbl bb where udf(aa.f1)=udf(bb.f1));
+
+-- try that with GEQO too
+-- begin;
+-- set geqo = on;
+-- set geqo_threshold = 2;
+select udf(count(*)) from tenk1 x where
+  udf(x.unique1) in (select udf(a.f1) from int4_tbl a,float8_tbl b where udf(a.f1)=udf(b.f1)) and
+  udf(x.unique1) = 0 and
+  udf(x.unique1) in (select udf(aa.f1) from int4_tbl aa,float8_tbl bb where udf(aa.f1)=udf(bb.f1));
+-- rollback;
+
+-- Skip this test because table b inherits from table a and we do not support this feature, see inherits.sql
+--
+-- regression test: be sure we cope with proven-dummy append rels
+--
+-- explain (costs off)
+-- select aa, bb, unique1, unique1
+--   from tenk1 right join b on aa = unique1
+--   where bb < bb and bb is null;
+
+-- select aa, bb, unique1, unique1
+--   from tenk1 right join b on aa = unique1
+--   where bb < bb and bb is null;
+
+--
+-- regression test: check handling of empty-FROM subquery underneath outer join
+--
+-- explain (costs off)
+-- select * from int8_tbl i1 left join (int8_tbl i2 join
+--   (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2
+-- order by 1, 2;
+
+-- Sort on the first two output columns explicitly. udf(1) is an ordinary
+-- expression rather than an ordinal, so "order by udf(1), 2" sorted on a
+-- constant and left rows that share the same i1.q2 in an unspecified order.
+select * from int8_tbl i1 left join (int8_tbl i2 join
+  (select udf(123) as x) ss on udf(i2.q1) = udf(x)) on udf(i1.q2) = udf(i2.q2)
+order by udf(i1.q1), udf(i1.q2);
+
+--
+-- regression test: check a case where join_clause_is_movable_into() gives
+-- an imprecise result, causing an assertion failure
+--
+select udf(count(*))
+from
+  (select udf(t3.tenthous) as x1, udf(coalesce(udf(t1.stringu1), udf(t2.stringu1))) as x2
+   from tenk1 t1
+   left join tenk1 t2 on udf(t1.unique1) = udf(t2.unique1)
+   join tenk1 t3 on t1.unique2 = t3.unique2) ss,
+  tenk1 t4,
+  tenk1 t5
+where udf(t4.thousand) = udf(t5.unique1) and udf(ss.x1) = udf(t4.tenthous) and udf(ss.x2) = udf(t5.stringu1);
+
+--
+-- regression test: check a case where we formerly missed including an EC
+-- enforcement clause because it was expected to be handled at scan level
+--
+-- explain (costs off)
+-- select a.f1, b.f1, t.thousand, t.tenthous from
+--   tenk1 t,
+--   (select sum(f1)+1 as f1 from int4_tbl i4a) a,
+--   (select sum(f1) as f1 from int4_tbl i4b) b
+-- where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous;
+
+select udf(a.f1), udf(b.f1), udf(t.thousand), udf(t.tenthous) from
+  tenk1 t,
+  (select udf(sum(udf(f1))+1) as f1 from int4_tbl i4a) a,
+  (select udf(sum(udf(f1))) as f1 from int4_tbl i4b) b
+where udf(b.f1) = udf(t.thousand) and udf(a.f1) = udf(b.f1) and udf((udf(a.f1)+udf(b.f1)+999)) = udf(t.tenthous);
+
+--
+-- check a case where we formerly got confused by conflicting sort orders
+-- in redundant merge join path keys
+--
+-- explain (costs off)
+-- select * from
+--   j1_tbl full join
+--   (select * from j2_tbl order by j2_tbl.i desc, j2_tbl.k asc) j2_tbl
+--   on j1_tbl.i = j2_tbl.i and j1_tbl.i = j2_tbl.k;
+
+select * from
+  j1_tbl full join
+  (select * from j2_tbl order by udf(j2_tbl.i) desc, udf(j2_tbl.k) asc) j2_tbl
+  on udf(j1_tbl.i) = udf(j2_tbl.i) and udf(j1_tbl.i) = udf(j2_tbl.k);
+
+--
+-- a different check for handling of redundant sort keys in merge joins
+--
+-- explain (costs off)
+-- select count(*) from
+--   (select * from tenk1 x order by x.thousand, x.twothousand, x.fivethous) x
+--   left join
+--   (select * from tenk1 y order by y.unique2) y
+--   on x.thousand = y.unique2 and x.twothousand = y.hundred and x.fivethous = y.unique2;
+
+select udf(count(*)) from
+  (select * from tenk1 x order by udf(x.thousand), udf(x.twothousand), x.fivethous) x
+  left join
+  (select * from tenk1 y order by udf(y.unique2)) y
+  on udf(x.thousand) = y.unique2 and x.twothousand = udf(y.hundred) and x.fivethous = y.unique2;
+
+
+--
+-- Clean up
+--
+
+DROP TABLE t1;
+DROP TABLE t2;
+DROP TABLE t3;
+
+DROP TABLE J1_TBL;
+DROP TABLE J2_TBL;
+
+-- Both DELETE and UPDATE allow the specification of additional tables
+-- to "join" against to determine which rows should be modified.
+
+-- CREATE TEMP TABLE t1 (a int, b int);
+-- CREATE TEMP TABLE t2 (a int, b int);
+-- CREATE TEMP TABLE t3 (x int, y int);
+
+-- INSERT INTO t1 VALUES (5, 10);
+-- INSERT INTO t1 VALUES (15, 20);
+-- INSERT INTO t1 VALUES (100, 100);
+-- INSERT INTO t1 VALUES (200, 1000);
+-- INSERT INTO t2 VALUES (200, 2000);
+-- INSERT INTO t3 VALUES (5, 20);
+-- INSERT INTO t3 VALUES (6, 7);
+-- INSERT INTO t3 VALUES (7, 8);
+-- INSERT INTO t3 VALUES (500, 100);
+
+-- DELETE FROM t3 USING t1 table1 WHERE t3.x = table1.a;
+-- SELECT * FROM t3;
+-- DELETE FROM t3 USING t1 JOIN t2 USING (a) WHERE t3.x > t1.a;
+-- SELECT * FROM t3;
+-- DELETE FROM t3 USING t3 t3_other WHERE t3.x = t3_other.x AND t3.y = t3_other.y;
+-- SELECT * FROM t3;
+
+-- Test join against inheritance tree
+
+-- create temp table t2a () inherits (t2);
+
+-- insert into t2a values (200, 2001);
+
+-- select * from t1 left join t2 on (t1.a = t2.a);
+
+-- Test matching of column name with wrong alias
+
+-- select t1.x from t1 join t3 on (t1.a = t3.x);
+
+--
+-- regression test for 8.1 merge right join bug
+--
+
+create or replace temporary view tt1 as select * from
+  (values (1, 11), (2, NULL))
+  as v(tt1_id, joincol);
+
+create or replace temporary view tt2 as select * from
+  (values (21, 11), (22, 11))
+  as v(tt2_id, joincol);
+
+-- set enable_hashjoin to off;
+-- set enable_nestloop to off;
+
+-- these should give the same results
+
+select tt1.*, tt2.* from tt1 left join tt2 on udf(tt1.joincol) = udf(tt2.joincol);
+
+select tt1.*, tt2.* from tt2 right join tt1 on udf(tt1.joincol) = udf(tt2.joincol);
+
+-- reset enable_hashjoin;
+-- reset enable_nestloop;
+
+--
+-- regression test for bug #13908 (hash join with skew tuples & nbatch increase)
+--
+
+-- set work_mem to '64kB';
+-- set enable_mergejoin to off;
+
+-- explain (costs off)
+-- select count(*) from tenk1 a, tenk1 b
+--   where a.hundred = b.thousand and (b.fivethous % 10) < 10;
+select udf(count(*)) from tenk1 a, tenk1 b
+  where udf(a.hundred) = udf(b.thousand) and udf(udf((b.fivethous % 10)) < 10);
+
+-- reset work_mem;
+-- reset enable_mergejoin;
+
+--
+-- regression test for 8.2 bug with improper re-ordering of left joins
+--
+
+DROP TABLE IF EXISTS tt3;
+CREATE TABLE tt3(f1 int, f2 string) USING parquet;
+INSERT INTO tt3 SELECT x.id, repeat('xyzzy', 100) FROM range(1,10001) x;
+-- create index tt3i on tt3(f1);
+-- analyze tt3;
+
+DROP TABLE IF EXISTS tt4;
+CREATE TABLE tt4(f1 int) USING parquet;
+INSERT INTO tt4 VALUES (0),(1),(9999);
+-- analyze tt4;
+
+-- The subquery column is aliased back to f1: an unaliased udf(b.f1) is
+-- exposed as `CAST(udf(cast(f1 as string)) AS INT)`, so the outer references
+-- to d.f1 could not be resolved.
+SELECT udf(a.f1)
+FROM tt4 a
+LEFT JOIN (
+        SELECT udf(b.f1) AS f1
+        FROM tt3 b LEFT JOIN tt3 c ON udf(b.f1) = udf(c.f1)
+        WHERE udf(c.f1) IS NULL
+) AS d ON udf(a.f1) = udf(d.f1)
+WHERE udf(d.f1) IS NULL;
+
+--
+-- regression test for proper handling of outer joins within antijoins
+--
+
+-- create temp table tt4x(c1 int, c2 int, c3 int);
+
+-- explain (costs off)
+-- select * from tt4x t1
+-- where not exists (
+--   select 1 from tt4x t2
+--     left join tt4x t3 on t2.c3 = t3.c1
+--     left join ( select t5.c1 as c1
+--                 from tt4x t4 left join tt4x t5 on t4.c2 = t5.c1
+--               ) a1 on t3.c2 = a1.c1
+--   where t1.c1 = t2.c2
+-- );
+
+--
+-- regression test for problems of the sort depicted in bug #3494
+--
+
+create or replace temporary view tt5 as select * from
+  (values (1, 10), (1, 11))
+  as v(f1, f2);
+create or replace temporary view tt6 as select * from
+  (values (1, 9), (1, 2), (2, 9))
+  as v(f1, f2);
+
+select * from tt5,tt6 where udf(tt5.f1) = udf(tt6.f1) and udf(tt5.f1) = udf(tt5.f2 - tt6.f2);
+
+--
+-- regression test for problems of the sort depicted in bug #3588
+--
+
+create or replace temporary view xx as select * from
+  (values (1), (2), (3))
+  as v(pkxx);
+create or replace temporary view yy as select * from
+  (values (101, 1), (201, 2), (301, NULL))
+  as v(pkyy, pkxx);
+
+select udf(yy.pkyy) as yy_pkyy, udf(yy.pkxx) as yy_pkxx, udf(yya.pkyy) as yya_pkyy,
+       udf(xxa.pkxx) as xxa_pkxx, udf(xxb.pkxx) as xxb_pkxx
+from yy
+     left join (SELECT * FROM yy where pkyy = 101) as yya ON udf(yy.pkyy) = udf(yya.pkyy)
+     left join xx xxa on udf(yya.pkxx) = udf(xxa.pkxx)
+     left join xx xxb on udf(coalesce (xxa.pkxx, 1)) = udf(xxb.pkxx);
+
+--
+-- regression test for improper pushing of constants across outer-join clauses
+-- (as seen in early 8.2.x releases)
+--
+
+create or replace temporary view zt1 as select * from
+  (values (53))
+  as v(f1);
+create or replace temporary view zt2 as select * from
+  (values (53))
+  as v(f2);
+create or replace temporary view zt3(f3 int) using parquet;
+
+select * from
+  zt2 left join zt3 on (udf(f2) = udf(f3))
+      left join zt1 on (udf(f3) = udf(f1))
+where udf(f2) = 53;
+
+create temp view zv1 as select *,'dummy' AS junk from zt1;
+
+select * from
+  zt2 left join zt3 on (udf(f2) = udf(f3))
+      left join zv1 on (udf(f3) = udf(f1))
+where udf(f2) = udf(53);
+
+--
+-- regression test for improper extraction of OR indexqual conditions
+-- (as seen in early 8.3.x releases)
+--
+
+select udf(a.unique2), udf(a.ten), udf(b.tenthous), udf(b.unique2), udf(b.hundred)
+from tenk1 a left join tenk1 b on udf(a.unique2) = udf(b.tenthous)
+where udf(a.unique1) = 42 and
+      ((udf(b.unique2) is null and udf(a.ten) = 2) or udf(b.hundred) = udf(3));
+
+--
+-- test proper positioning of one-time quals in EXISTS (8.4devel bug)
+--
+-- prepare foo(bool) as
+--   select count(*) from tenk1 a left join tenk1 b
+--     on (a.unique2 = b.unique1 and exists
+--         (select 1 from tenk1 c where c.thousand = b.unique2 and $1));
+-- execute foo(true);
+-- execute foo(false);
+
+--
+-- test for sane behavior with noncanonical merge clauses, per bug #4926
+--
+
+-- begin;
+
+-- set enable_mergejoin = 1;
+-- set enable_hashjoin = 0;
+-- set enable_nestloop = 0;
+
+create or replace temporary view a (i integer) using parquet;
+create or replace temporary view b (x integer, y integer) using parquet;
+
+select * from a left join b on udf(i) = udf(x) and udf(i) = udf(y) and udf(x) = udf(i);
+
+-- rollback;
+
+--
+-- test handling of merge clauses using record_ops
+--
+-- begin;
+
+-- create type mycomptype as (id int, v bigint);
+
+-- create temp table tidv (idv mycomptype);
+-- create index on tidv (idv);
+
+-- explain (costs off)
+-- select a.idv, b.idv from tidv a, tidv b where a.idv = b.idv;
+
+-- set enable_mergejoin = 0;
+
+-- explain (costs off)
+-- select a.idv, b.idv from tidv a, tidv b where a.idv = b.idv;
+
+-- rollback;
+
+--
+-- test NULL behavior of whole-row Vars, per bug #5025
+--
+--- comment out for now
+--- problem caused by `group by udf(t1.q2)`
+--- -- !query 130 schema
+----struct<q2:bigint,count(q1, q2):bigint>
+---+struct<>
+--- -- !query 130 output
+-----4567890123456789      0
+----123    2
+----456    0
+----4567890123456789       6
+---+org.apache.spark.sql.AnalysisException
+---+expression 't1.`q2`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+---select udf(t1.q2), udf(count(t2.*))
+---from int8_tbl t1 left join int8_tbl t2 on (udf(t1.q2) = udf(t2.q1))
+---group by udf(t1.q2) order by 1;
+select udf(t1.q2), udf(count(t2.*))
+from int8_tbl t1 left join int8_tbl t2 on (udf(t1.q2) = udf(t2.q1))
+group by t1.q2 order by 1;
+
+select udf(t1.q2), udf(count(t2.*))
+from int8_tbl t1 left join (select * from int8_tbl) t2 on (udf(t1.q2) = udf(t2.q1))
+group by t1.q2 order by 1;
+
+-- [SPARK-28330] Enhance query limit
+-- select t1.q2, count(t2.*)
+-- from int8_tbl t1 left join (select * from int8_tbl offset 0) t2 on (t1.q2 = t2.q1)
+-- group by t1.q2 order by 1;
+
+-- The subquery's udf(q1) is aliased back to q1: without the alias the column
+-- is exposed as `CAST(udf(cast(q1 as string)) AS BIGINT)` and the join
+-- condition's reference to t2.q1 cannot be resolved.
+select udf(t1.q2), udf(count(t2.*))
+from int8_tbl t1 left join
+  (select udf(q1) as q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2
+  on (udf(t1.q2) = udf(t2.q1))
+group by t1.q2 order by 1;
+
+--
+-- test incorrect failure to NULL pulled-up subexpressions
+--
+-- begin;
+create or replace temporary view a as select * from
+  (values ('p'), ('q'))
+  as v(code);
+create or replace temporary view b as select * from
+  (values ('p', 1), ('p', 2))
+  as v(a, num);
+create or replace temporary view c as select * from
+  (values ('A', 'p'), ('B', 'q'), ('C', null))
+  as v(name, a);
+
+
+-- udf(b.a) is aliased back to a: without the alias the grouped column is
+-- exposed as `CAST(udf(cast(a as string)) AS STRING)` and the join
+-- condition's reference to b_grp.a cannot be resolved.
+select udf(c.name), udf(ss.code), udf(ss.b_cnt), udf(ss.const)
+from c left join
+  (select a.code, coalesce(b_grp.cnt, 0) as b_cnt, -1 as const
+   from a left join
+     (select udf(count(1)) as cnt, udf(b.a) as a from b group by b.a) as b_grp
+     on udf(a.code) = udf(b_grp.a)
+  ) as ss
+  on (udf(c.a) = udf(ss.code))
+order by c.name;
+
+-- rollback;
+
+--
+-- test incorrect handling of placeholders that only appear in targetlists,
+-- per bug #6154
+--
+-- udf(sub5.key5) is aliased back to key5: without the alias the column is
+-- exposed as `CAST(udf(cast(key5 as string)) AS INT)` and the enclosing join
+-- condition's reference to sub4.key5 cannot be resolved.
+SELECT * FROM
+( SELECT 1 as key1 ) sub1
+LEFT JOIN
+( SELECT sub3.key3, sub4.value2, COALESCE(sub4.value2, 66) as value3 FROM
+    ( SELECT 1 as key3 ) sub3
+    LEFT JOIN
+    ( SELECT udf(sub5.key5) as key5, udf(COALESCE(sub6.value1, 1)) as value2 FROM
+        ( SELECT 1 as key5 ) sub5
+        LEFT JOIN
+        ( SELECT 2 as key6, 42 as value1 ) sub6
+        ON udf(sub5.key5) = udf(sub6.key6)
+    ) sub4
+    ON udf(sub4.key5) = udf(sub3.key3)
+) sub2
+ON udf(sub1.key1) = udf(sub2.key3);
+
+-- test the path using join aliases, too
+---comment out for now query 138
+---problem caused by  udf(sub3.key3)
+---+org.apache.spark.sql.AnalysisException
+---+cannot resolve '`sub2.key3`' given input columns: [sub2.CAST(udf(cast(key3 as string)) AS INT), sub2.CAST(udf(cast(value2 as string)) AS INT), sub1.key1, sub2.value3]; line 15 pos 15
+---SELECT * FROM
+---( SELECT 1 as key1 ) sub1
+---LEFT JOIN
+---( SELECT udf(sub3.key3), udf(value2), udf(COALESCE(value2, 66)) as value3 FROM
+---    ( SELECT 1 as key3 ) sub3
+---    LEFT JOIN
+---    ( SELECT sub5.key5, COALESCE(sub6.value1, 1) as value2 FROM
+---        ( SELECT 1 as key5 ) sub5
+---        LEFT JOIN
+---        ( SELECT 2 as key6, 42 as value1 ) sub6
+---        ON sub5.key5 = sub6.key6
+---    ) sub4
+---    ON sub4.key5 = sub3.key3
+---) sub2
+---ON sub1.key1 = sub2.key3;
 
 Review comment:
   I think the problem is here: there is no `key3` column in `sub2` now, because `udf(sub3.key3)` has no alias. You might need to write:
   
   ```
   ...
   SELECT udf(sub3.key3) as key3, udf(value2), udf(COALESCE(value2, 66)) as value3 FROM
   ...
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org