You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@drill.apache.org by "Victoria Markman (JIRA)" <ji...@apache.org> on 2015/03/04 00:26:04 UTC
[jira] [Created] (DRILL-2369) It takes too long to plan inner join query with 10 join predicates

Victoria Markman created DRILL-2369:
---------------------------------------

             Summary: It takes too long to plan inner join query with 10 join predicates
                 Key: DRILL-2369
                 URL: https://issues.apache.org/jira/browse/DRILL-2369
             Project: Apache Drill
          Issue Type: Bug
          Components: Query Planning & Optimization
            Reporter: Victoria Markman
            Assignee: Jinfeng Ni


Inner join with 10 join predicates takes 11 seconds to plan.
If number of predicates goes up to 14, planning time increases to 41 seconds.

(1) ON clause with 10 columns in join condition

-- Total elapsed time : 11.407 seconds
-- Planning time      : 11.03 seconds

{code}
select
        count(*)
from
        alltypes_with_nulls a
        INNER JOIN
        alltypes_with_nulls b ON
        (
        a.c_boolean = b.c_boolean
        AND a.c_timestamp = b.c_timestamp
        AND a.c_time = b.c_time
        AND a.c_date = b.c_date
        AND a.c_float = b.c_float
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigint = b.c_bigint
        AND a.c_integer = b.c_integer
        AND a.c_varchar = b.c_varchar
        )
;
{code}

Explain plan:

{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02        HashAgg(group=[{0}])
00-03          Project(c_date=[$3])
00-04            HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-06              Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
00-05              Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9])
00-07                Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-09                  Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
{code}

(2) 10 columns in JOIN condition, in the WHERE clause (just to check that nothing funny is going on with ON clause)

-- Total elapsed time : 11.139 seconds
-- Planning time      : 11.416 seconds

{code}
select
        count(*)
from
        alltypes_with_nulls a,
        alltypes_with_nulls b
where
        a.c_boolean = b.c_boolean
        AND a.c_timestamp = b.c_timestamp
        AND a.c_time = b.c_time
        AND a.c_date = b.c_date
        AND a.c_float = b.c_float
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigint = b.c_bigint
        AND a.c_integer = b.c_integer
        AND a.c_varchar = b.c_varchar
;
{code}

Explain plan:

{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT()])
00-02        Project($f0=[0])
00-03          HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-05            Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-07              Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
00-04            Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9])
00-06              Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]])
{code}

(3) 14 columns in join condition

-- Total elapsed time : 41.101 seconds
-- Planning time        : 39.771 seconds

{code}
select
        count(distinct a.c_date)
from
        alltypes_with_nulls a
        INNER JOIN
        alltypes_with_nulls b ON
        (
        a.c_varchar = b.c_varchar
        AND a.c_integer = b.c_integer
        AND a.c_bigint = b.c_bigint
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_float = b.c_float
        AND a.c_date = b.c_date
        AND a.c_time = b.c_time
        AND a.c_timestamp = b.c_timestamp
        AND a.c_boolean = b.c_boolean
        AND a.d9 = b.d9
        AND a.d18 = b.d18
        AND a.d28 = b.d28
        AND a.d38 = b.d38
        )
;
{code}

Explain plan:
{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02        HashAgg(group=[{0}])
00-03          Project(c_date=[$6])
00-04            HashJoin(condition=[AND(=($0, $14), =($1, $15), =($2, $16), =($3, $17), =($4, $18), =($5, $19), =($6, $20), =($7, $21), =($8, $22), =($9, $23), =($10, $24), =($11, $25), =($12, $26), =($13, $27))], joinType=[inner])
00-06              Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
00-05              Project(c_varchar0=[$0], c_integer0=[$1], c_bigint0=[$2], c_smalldecimal0=[$3], c_bigdecimal0=[$4], c_float0=[$5], c_date0=[$6], c_time0=[$7], c_timestamp0=[$8], c_boolean0=[$9], d90=[$10], d180=[$11], d280=[$12], d380=[$13])
00-07                Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-09                  Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)