You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2020/05/14 00:28:00 UTC
[jira] [Updated] (SPARK-31705) Rewrite join condition to
conjunctive normal form
[ https://issues.apache.org/jira/browse/SPARK-31705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Yuming Wang updated SPARK-31705:
--------------------------------
Description:
Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.
PostgreSQL:
{code:sql}
CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,
l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),
l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),
l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,
l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
CREATE TABLE orders (
o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),
o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));
EXPLAIN
SELECT Count(*)
FROM lineitem,
orders
WHERE l_orderkey = o_orderkey
AND ( ( l_suppkey > 3
AND o_custkey > 13 )
OR ( l_suppkey > 1
AND o_custkey > 11 ) )
AND l_partkey > 19;
EXPLAIN
SELECT Count(*)
FROM lineitem
JOIN orders
ON l_orderkey = o_orderkey
AND ( ( l_suppkey > 3
AND o_custkey > 13 )
OR ( l_suppkey > 1
AND o_custkey > 11 ) )
AND l_partkey > 19;
{code}
{noformat}
postgres=# EXPLAIN
postgres-# SELECT Count(*)
postgres-# FROM lineitem,
postgres-# orders
postgres-# WHERE l_orderkey = o_orderkey
postgres-# AND ( ( l_suppkey > 3
postgres(# AND o_custkey > 13 )
postgres(# OR ( l_suppkey > 1
postgres(# AND o_custkey > 11 ) )
postgres-# AND l_partkey > 19;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=21.18..21.19 rows=1 width=8)
-> Hash Join (cost=10.60..21.17 rows=2 width=0)
Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
-> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
Filter: ((o_custkey > 13) OR (o_custkey > 11))
-> Hash (cost=10.53..10.53 rows=6 width=16)
-> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
(9 rows)
postgres=# EXPLAIN
postgres-# SELECT Count(*)
postgres-# FROM lineitem
postgres-# JOIN orders
postgres-# ON l_orderkey = o_orderkey
postgres-# AND ( ( l_suppkey > 3
postgres(# AND o_custkey > 13 )
postgres(# OR ( l_suppkey > 1
postgres(# AND o_custkey > 11 ) )
postgres-# AND l_partkey > 19;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=21.18..21.19 rows=1 width=8)
-> Hash Join (cost=10.60..21.17 rows=2 width=0)
Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
-> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
Filter: ((o_custkey > 13) OR (o_custkey > 11))
-> Hash (cost=10.53..10.53 rows=6 width=16)
-> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
(9 rows)
{noformat}
was:
Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.
PostgreSQL:
{code:sql}
CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,
l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),
l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),
l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,
l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
CREATE TABLE orders (
o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),
o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));
explain select count(*) from lineitem, orders
where l_orderkey = o_orderkey
and ((l_suppkey > 10 and o_custkey > 20)
or (l_suppkey > 30 and o_custkey > 40))
and l_partkey > 0;
explain select count(*) from lineitem join orders
on l_orderkey = o_orderkey
and ((l_suppkey > 10 and o_custkey > 20)
or (l_suppkey > 30 and o_custkey > 40))
and l_partkey > 0;
{code}
{noformat}
postgres=# explain select count(*) from lineitem, orders
postgres-# where l_orderkey = o_orderkey
postgres-# and ((l_suppkey > 10 and o_custkey > 20)
postgres(# or (l_suppkey > 30 and o_custkey > 40))
postgres-# and l_partkey > 0;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=21.18..21.19 rows=1 width=8)
-> Hash Join (cost=10.60..21.17 rows=2 width=0)
Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
Join Filter: (((lineitem.l_suppkey > 10) AND (orders.o_custkey > 20)) OR ((lineitem.l_suppkey > 30) AND (orders.o_custkey > 40)))
-> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
Filter: ((o_custkey > 20) OR (o_custkey > 40))
-> Hash (cost=10.53..10.53 rows=6 width=16)
-> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
Filter: ((l_partkey > 0) AND ((l_suppkey > 10) OR (l_suppkey > 30)))
(9 rows)
postgres=#
postgres=# explain select count(*) from lineitem join orders
postgres-# on l_orderkey = o_orderkey
postgres-# and ((l_suppkey > 10 and o_custkey > 20)
postgres(# or (l_suppkey > 30 and o_custkey > 40))
postgres-# and l_partkey > 0;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=21.18..21.19 rows=1 width=8)
-> Hash Join (cost=10.60..21.17 rows=2 width=0)
Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
Join Filter: (((lineitem.l_suppkey > 10) AND (orders.o_custkey > 20)) OR ((lineitem.l_suppkey > 30) AND (orders.o_custkey > 40)))
-> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
Filter: ((o_custkey > 20) OR (o_custkey > 40))
-> Hash (cost=10.53..10.53 rows=6 width=16)
-> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
Filter: ((l_partkey > 0) AND ((l_suppkey > 10) OR (l_suppkey > 30)))
(9 rows)
{noformat}
> Rewrite join condition to conjunctive normal form
> -------------------------------------------------
>
> Key: SPARK-31705
> URL: https://issues.apache.org/jira/browse/SPARK-31705
> Project: Spark
> Issue Type: New Feature
> Components: SQL
> Affects Versions: 3.1.0
> Reporter: Yuming Wang
> Assignee: Yuming Wang
> Priority: Major
>
> Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.
> PostgreSQL:
> {code:sql}
> CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,
> l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),
> l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),
> l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,
> l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
>
> CREATE TABLE orders (
> o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),
> o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
> o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));
> EXPLAIN
> SELECT Count(*)
> FROM lineitem,
> orders
> WHERE l_orderkey = o_orderkey
> AND ( ( l_suppkey > 3
> AND o_custkey > 13 )
> OR ( l_suppkey > 1
> AND o_custkey > 11 ) )
> AND l_partkey > 19;
> EXPLAIN
> SELECT Count(*)
> FROM lineitem
> JOIN orders
> ON l_orderkey = o_orderkey
> AND ( ( l_suppkey > 3
> AND o_custkey > 13 )
> OR ( l_suppkey > 1
> AND o_custkey > 11 ) )
> AND l_partkey > 19;
> {code}
> {noformat}
> postgres=# EXPLAIN
> postgres-# SELECT Count(*)
> postgres-# FROM lineitem,
> postgres-# orders
> postgres-# WHERE l_orderkey = o_orderkey
> postgres-# AND ( ( l_suppkey > 3
> postgres(# AND o_custkey > 13 )
> postgres(# OR ( l_suppkey > 1
> postgres(# AND o_custkey > 11 ) )
> postgres-# AND l_partkey > 19;
> QUERY PLAN
> -----------------------------------------------------------------------------------------------------------------------------------------
> Aggregate (cost=21.18..21.19 rows=1 width=8)
> -> Hash Join (cost=10.60..21.17 rows=2 width=0)
> Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
> Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
> -> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
> Filter: ((o_custkey > 13) OR (o_custkey > 11))
> -> Hash (cost=10.53..10.53 rows=6 width=16)
> -> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
> Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
> (9 rows)
> postgres=# EXPLAIN
> postgres-# SELECT Count(*)
> postgres-# FROM lineitem
> postgres-# JOIN orders
> postgres-# ON l_orderkey = o_orderkey
> postgres-# AND ( ( l_suppkey > 3
> postgres(# AND o_custkey > 13 )
> postgres(# OR ( l_suppkey > 1
> postgres(# AND o_custkey > 11 ) )
> postgres-# AND l_partkey > 19;
> QUERY PLAN
> -----------------------------------------------------------------------------------------------------------------------------------------
> Aggregate (cost=21.18..21.19 rows=1 width=8)
> -> Hash Join (cost=10.60..21.17 rows=2 width=0)
> Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
> Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
> -> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16)
> Filter: ((o_custkey > 13) OR (o_custkey > 11))
> -> Hash (cost=10.53..10.53 rows=6 width=16)
> -> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16)
> Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
> (9 rows)
> {noformat}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org