You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@drill.apache.org by "Victoria Markman (JIRA)" <ji...@apache.org> on 2015/03/04 00:26:04 UTC
[jira] [Created] (DRILL-2369) It takes too long to plan inner join
query with 10 join predicates
Victoria Markman created DRILL-2369:
---------------------------------------
Summary: It takes too long to plan inner join query with 10 join predicates
Key: DRILL-2369
URL: https://issues.apache.org/jira/browse/DRILL-2369
Project: Apache Drill
Issue Type: Bug
Components: Query Planning & Optimization
Reporter: Victoria Markman
Assignee: Jinfeng Ni
Inner join with 10 join predicates takes 11 seconds to plan.
If number of predicates goes up to 14, planning time increases to 41 seconds.
(1) ON clause with 10 columns in join condition
-- Total elapsed time : 11.407 seconds
-- Planning time : 11.03 seconds
{code}
select
count(*)
from
alltypes_with_nulls a
INNER JOIN
alltypes_with_nulls b ON
(
a.c_boolean = b.c_boolean
AND a.c_timestamp = b.c_timestamp
AND a.c_time = b.c_time
AND a.c_date = b.c_date
AND a.c_float = b.c_float
AND a.c_bigdecimal = b.c_bigdecimal
AND a.c_smalldecimal = b.c_smalldecimal
AND a.c_bigint = b.c_bigint
AND a.c_integer = b.c_integer
AND a.c_varchar = b.c_varchar
)
;
{code}
Explain plan:
{code}
00-01 StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02 HashAgg(group=[{0}])
00-03 Project(c_date=[$3])
00-04 HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-06 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
00-05 Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9])
00-07 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-09 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
{code}
(2) 10 columns in JOIN condition, in the WHERE clause (just to check that nothing funny is going on with ON clause)
-- Total elapsed time : 11.139 seconds
-- Planning time : 11.416 seconds
{code}
select
count(*)
from
alltypes_with_nulls a,
alltypes_with_nulls b
where
a.c_boolean = b.c_boolean
AND a.c_timestamp = b.c_timestamp
AND a.c_time = b.c_time
AND a.c_date = b.c_date
AND a.c_float = b.c_float
AND a.c_bigdecimal = b.c_bigdecimal
AND a.c_smalldecimal = b.c_smalldecimal
AND a.c_bigint = b.c_bigint
AND a.c_integer = b.c_integer
AND a.c_varchar = b.c_varchar
;
{code}
Explain plan:
{code}
00-01 StreamAgg(group=[{}], EXPR$0=[COUNT()])
00-02 Project($f0=[0])
00-03 HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-05 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-07 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
00-04 Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9])
00-06 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
{code}
(3) 14 columns in join condition
-- Total elapsed time : 41.101 seconds
-- Planning time : 39.771 seconds
{code}
select
count(distinct a.c_date)
from
alltypes_with_nulls a
INNER JOIN
alltypes_with_nulls b ON
(
a.c_varchar = b.c_varchar
AND a.c_integer = b.c_integer
AND a.c_bigint = b.c_bigint
AND a.c_smalldecimal = b.c_smalldecimal
AND a.c_bigdecimal = b.c_bigdecimal
AND a.c_float = b.c_float
AND a.c_date = b.c_date
AND a.c_time = b.c_time
AND a.c_timestamp = b.c_timestamp
AND a.c_boolean = b.c_boolean
AND a.d9 = b.d9
AND a.d18 = b.d18
AND a.d28 = b.d28
AND a.d38 = b.d38
)
;
{code}
Explain plan:
{code}
00-01 StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02 HashAgg(group=[{0}])
00-03 Project(c_date=[$6])
00-04 HashJoin(condition=[AND(=($0, $14), =($1, $15), =($2, $16), =($3, $17), =($4, $18), =($5, $19), =($6, $20), =($7, $21), =($8, $22), =($9, $23), =($10, $24), =($11, $25), =($12, $26), =($13, $27))], joinType=[inner])
00-06 Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
00-05 Project(c_varchar0=[$0], c_integer0=[$1], c_bigint0=[$2], c_smalldecimal0=[$3], c_bigdecimal0=[$4], c_float0=[$5], c_date0=[$6], c_time0=[$7], c_timestamp0=[$8], c_boolean0=[$9], d90=[$10], d180=[$11], d280=[$12], d380=[$13])
00-07 Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-09 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
{code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)