You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2020/05/14 00:28:00 UTC

[jira] [Updated] (SPARK-31705) Rewrite join condition to conjunctive normal form

     [ https://issues.apache.org/jira/browse/SPARK-31705?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-31705:
--------------------------------
    Description: 
Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.

PostgreSQL:
{code:sql}
CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,   
l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),        
l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),               
l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,    
l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
  
CREATE TABLE orders (                                                    
o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),           
o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));  

EXPLAIN
SELECT Count(*)
FROM   lineitem,
       orders
WHERE  l_orderkey = o_orderkey
       AND ( ( l_suppkey > 3
               AND o_custkey > 13 )
              OR ( l_suppkey > 1
                   AND o_custkey > 11 ) )
       AND l_partkey > 19;

EXPLAIN
SELECT Count(*)
FROM   lineitem
       JOIN orders
         ON l_orderkey = o_orderkey
            AND ( ( l_suppkey > 3
                    AND o_custkey > 13 )
                   OR ( l_suppkey > 1
                        AND o_custkey > 11 ) )
            AND l_partkey > 19;
{code}



{noformat}
postgres=# EXPLAIN
postgres-# SELECT Count(*)
postgres-# FROM   lineitem,
postgres-#        orders
postgres-# WHERE  l_orderkey = o_orderkey
postgres-#        AND ( ( l_suppkey > 3
postgres(#                AND o_custkey > 13 )
postgres(#               OR ( l_suppkey > 1
postgres(#                    AND o_custkey > 11 ) )
postgres-#        AND l_partkey > 19;
                                                               QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=21.18..21.19 rows=1 width=8)
   ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
         Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
         Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
         ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
               Filter: ((o_custkey > 13) OR (o_custkey > 11))
         ->  Hash  (cost=10.53..10.53 rows=6 width=16)
               ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                     Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
(9 rows)

postgres=# EXPLAIN
postgres-# SELECT Count(*)
postgres-# FROM   lineitem
postgres-#        JOIN orders
postgres-#          ON l_orderkey = o_orderkey
postgres-#             AND ( ( l_suppkey > 3
postgres(#                     AND o_custkey > 13 )
postgres(#                    OR ( l_suppkey > 1
postgres(#                         AND o_custkey > 11 ) )
postgres-#             AND l_partkey > 19;
                                                               QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=21.18..21.19 rows=1 width=8)
   ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
         Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
         Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
         ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
               Filter: ((o_custkey > 13) OR (o_custkey > 11))
         ->  Hash  (cost=10.53..10.53 rows=6 width=16)
               ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                     Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
(9 rows)
{noformat}




  was:
Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.

PostgreSQL:
{code:sql}
CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,   
l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),        
l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),               
l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,    
l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
  
CREATE TABLE orders (                                                    
o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),           
o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));  

explain select count(*) from lineitem, orders
 where l_orderkey = o_orderkey
 and ((l_suppkey > 10 and o_custkey > 20)
      or (l_suppkey > 30 and o_custkey > 40))
 and l_partkey > 0;

explain select count(*) from lineitem join orders
 on l_orderkey = o_orderkey
 and ((l_suppkey > 10 and o_custkey > 20)
      or (l_suppkey > 30 and o_custkey > 40))
 and l_partkey > 0;
{code}



{noformat}
postgres=# explain select count(*) from lineitem, orders
postgres-#  where l_orderkey = o_orderkey
postgres-#  and ((l_suppkey > 10 and o_custkey > 20)
postgres(#       or (l_suppkey > 30 and o_custkey > 40))
postgres-#  and l_partkey > 0;
                                                                QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=21.18..21.19 rows=1 width=8)
   ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
         Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
         Join Filter: (((lineitem.l_suppkey > 10) AND (orders.o_custkey > 20)) OR ((lineitem.l_suppkey > 30) AND (orders.o_custkey > 40)))
         ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
               Filter: ((o_custkey > 20) OR (o_custkey > 40))
         ->  Hash  (cost=10.53..10.53 rows=6 width=16)
               ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                     Filter: ((l_partkey > 0) AND ((l_suppkey > 10) OR (l_suppkey > 30)))
(9 rows)

postgres=#
postgres=# explain select count(*) from lineitem join orders
postgres-#  on l_orderkey = o_orderkey
postgres-#  and ((l_suppkey > 10 and o_custkey > 20)
postgres(#       or (l_suppkey > 30 and o_custkey > 40))
postgres-#  and l_partkey > 0;
                                                                QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=21.18..21.19 rows=1 width=8)
   ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
         Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
         Join Filter: (((lineitem.l_suppkey > 10) AND (orders.o_custkey > 20)) OR ((lineitem.l_suppkey > 30) AND (orders.o_custkey > 40)))
         ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
               Filter: ((o_custkey > 20) OR (o_custkey > 40))
         ->  Hash  (cost=10.53..10.53 rows=6 width=16)
               ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
                     Filter: ((l_partkey > 0) AND ((l_suppkey > 10) OR (l_suppkey > 30)))
(9 rows)
{noformat}





> Rewrite join condition to conjunctive normal form
> -------------------------------------------------
>
>                 Key: SPARK-31705
>                 URL: https://issues.apache.org/jira/browse/SPARK-31705
>             Project: Spark
>          Issue Type: New Feature
>          Components: SQL
>    Affects Versions: 3.1.0
>            Reporter: Yuming Wang
>            Assignee: Yuming Wang
>            Priority: Major
>
> Rewrite join condition to [conjunctive normal form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more conditions to filter.
> PostgreSQL:
> {code:sql}
> CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT,   
> l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0),        
> l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255),               
> l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE,    
> l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255));
>   
> CREATE TABLE orders (                                                    
> o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255),           
> o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255),
> o_clerk varchar(255), o_shippriority INT, o_comment varchar(255));  
> EXPLAIN
> SELECT Count(*)
> FROM   lineitem,
>        orders
> WHERE  l_orderkey = o_orderkey
>        AND ( ( l_suppkey > 3
>                AND o_custkey > 13 )
>               OR ( l_suppkey > 1
>                    AND o_custkey > 11 ) )
>        AND l_partkey > 19;
> EXPLAIN
> SELECT Count(*)
> FROM   lineitem
>        JOIN orders
>          ON l_orderkey = o_orderkey
>             AND ( ( l_suppkey > 3
>                     AND o_custkey > 13 )
>                    OR ( l_suppkey > 1
>                         AND o_custkey > 11 ) )
>             AND l_partkey > 19;
> {code}
> {noformat}
> postgres=# EXPLAIN
> postgres-# SELECT Count(*)
> postgres-# FROM   lineitem,
> postgres-#        orders
> postgres-# WHERE  l_orderkey = o_orderkey
> postgres-#        AND ( ( l_suppkey > 3
> postgres(#                AND o_custkey > 13 )
> postgres(#               OR ( l_suppkey > 1
> postgres(#                    AND o_custkey > 11 ) )
> postgres-#        AND l_partkey > 19;
>                                                                QUERY PLAN
> -----------------------------------------------------------------------------------------------------------------------------------------
>  Aggregate  (cost=21.18..21.19 rows=1 width=8)
>    ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
>          Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
>          Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
>          ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
>                Filter: ((o_custkey > 13) OR (o_custkey > 11))
>          ->  Hash  (cost=10.53..10.53 rows=6 width=16)
>                ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
>                      Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
> (9 rows)
> postgres=# EXPLAIN
> postgres-# SELECT Count(*)
> postgres-# FROM   lineitem
> postgres-#        JOIN orders
> postgres-#          ON l_orderkey = o_orderkey
> postgres-#             AND ( ( l_suppkey > 3
> postgres(#                     AND o_custkey > 13 )
> postgres(#                    OR ( l_suppkey > 1
> postgres(#                         AND o_custkey > 11 ) )
> postgres-#             AND l_partkey > 19;
>                                                                QUERY PLAN
> -----------------------------------------------------------------------------------------------------------------------------------------
>  Aggregate  (cost=21.18..21.19 rows=1 width=8)
>    ->  Hash Join  (cost=10.60..21.17 rows=2 width=0)
>          Hash Cond: (orders.o_orderkey = lineitem.l_orderkey)
>          Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11)))
>          ->  Seq Scan on orders  (cost=0.00..10.45 rows=17 width=16)
>                Filter: ((o_custkey > 13) OR (o_custkey > 11))
>          ->  Hash  (cost=10.53..10.53 rows=6 width=16)
>                ->  Seq Scan on lineitem  (cost=0.00..10.53 rows=6 width=16)
>                      Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR (l_suppkey > 1)))
> (9 rows)
> {noformat}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org