You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by "Rahul Challapalli (JIRA)" <ji...@apache.org> on 2014/12/13 01:02:14 UTC
[jira] [Created] (DRILL-1861) Optimizer selects hash aggregate even
when we have a sorted dataset
Rahul Challapalli created DRILL-1861:
----------------------------------------
Summary: Optimizer selects hash aggregate even when we have a sorted dataset
Key: DRILL-1861
URL: https://issues.apache.org/jira/browse/DRILL-1861
Project: Apache Drill
Issue Type: Bug
Components: Execution - Operators, Query Planning & Optimization
Reporter: Rahul Challapalli
git.commit.id.abbrev=142e577
Query :
{code}
select max(length(sub.str_var)) from ( select str_var, tinyint_var from `wide-strings` order by tinyint_var) sub group by sub.tinyint_var;
{code}
Plan :
{code}
+------------+------------+
| text | json |
+------------+------------+
| 00-00 Screen
00-01 Project(EXPR$0=[$1])
00-02 HashAgg(group=[{0}], EXPR$0=[MAX($1)])
00-03 Project(tinyint_var=[$1], $f1=[LENGTH($0)])
00-04 SelectionVectorRemover
00-05 Sort(sort0=[$1], dir0=[ASC])
00-06 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings]], selectionRoot=/drill/testdata/data-shapes/wide-columns/flat/wide-strings, numFiles=1, columns=[`str_var`, `tinyint_var`]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "parquet-scan",
"@id" : 6,
"entries" : [ {
"path" : "maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings"
} ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "maprfs:///",
"workspaces" : {
"root" : {
"location" : "/",
"writable" : false,
"defaultInputFormat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"defaultInputFormat" : "csv"
},
"drillTestDir" : {
"location" : "/drill/testdata/",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirAmplab" : {
"location" : "/drill/testdata/amplab",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirInformationSchema" : {
"location" : "/drill/testdata/information-schema",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirUdfs" : {
"location" : "/drill/testdata/udfs/",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirP1" : {
"location" : "/drill/testdata/p1tests",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirTpch10Parquet" : {
"location" : "/drill/testdata/tpch10",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"Join" : {
"location" : "/drill/testdata/join",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"NoExtJson" : {
"location" : "/drill/testdata/no-extension/json",
"writable" : true,
"defaultInputFormat" : "json"
},
"NoExtParquet" : {
"location" : "/drill/testdata/no-extension/parquet",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"NoExtParquetNull" : {
"location" : "/drill/testdata/no-extension/parquet",
"writable" : true,
"defaultInputFormat" : null
},
"NoExtText" : {
"location" : "/drill/testdata/no-extension/text",
"writable" : true,
"defaultInputFormat" : "psv"
},
"drillTestDirExchanges" : {
"location" : "/drill/testdata/exchanges_test",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"TpcHMulti" : {
"location" : "/drill/testdata/tpch-multi",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"TpcHMulti100" : {
"location" : "/drill/testdata/SF100",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"TpcHMulti1" : {
"location" : "/drill/testdata/tpch_SF1",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirExplicit" : {
"location" : "/drill/testdata/explicit_cast",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirImplicit" : {
"location" : "/drill/testdata/implicit_cast",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirImplicit1" : {
"location" : "/drill/testdata/implicit_cast",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirTPCDS" : {
"location" : "/user/root/tpcds/parquet",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"TPCDS" : {
"location" : "/drill/testdata/tpcds",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillMondrian" : {
"location" : "/user/root/mondrian",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirDatetime" : {
"location" : "/drill/testdata/datetime/datasources",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirViews" : {
"location" : "/drill/testdata/views/",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirNumerical" : {
"location" : "/drill/testdata/numerical/",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"drillTestDirJson" : {
"location" : "/drill/testdata/json_storage/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTestNewWS" : {
"location" : "/drill/testdata/newWS/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpch01Text" : {
"location" : "/drill/testdata/Tpch0.01/text/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpch01Json" : {
"location" : "/drill/testdata/Tpch0.01/json/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpch01Parquet" : {
"location" : "/drill/testdata/Tpch0.01/parquet/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirConvert" : {
"location" : "/drill/testdata/convert",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpch100Text" : {
"location" : "/drill/testdata/tpch100/text/",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpch100Parquet" : {
"location" : "/drill/testdata/tpch100/parquet",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirAggregate1parquet" : {
"location" : "/drill/testdata/tpcds/parquet/s1",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirAggregate1csv" : {
"location" : "/drill/testdata/tpcds/csv/s1",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirAggregate1json" : {
"location" : "/drill/testdata/tpcds/json/s1",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirMondrian" : {
"location" : "/drill/testdata/mondrian",
"writable" : true,
"defaultInputFormat" : null
},
"drillTestDirTpcdsImpalaSF1" : {
"location" : "/drill/testdata/tpcds-impala-sf1",
"writable" : true,
"defaultInputFormat" : null
},
"sandbox" : {
"location" : "/sandbox",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"sandbox-logs" : {
"location" : "/sandbox/flat",
"writable" : true,
"defaultInputFormat" : "parquet"
},
"sandbox-json" : {
"location" : "/sandbox/json",
"writable" : true,
"defaultInputFormat" : "parquet"
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"dsv" : {
"type" : "text",
"extensions" : [ "dat" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json"
}
}
},
"format" : {
"type" : "parquet"
},
"columns" : [ "`str_var`", "`tinyint_var`" ],
"selectionRoot" : "/drill/testdata/data-shapes/wide-columns/flat/wide-strings",
"cost" : 50.0
}, {
"pop" : "external-sort",
"@id" : 5,
"child" : 6,
"orderings" : [ {
"order" : "ASC",
"expr" : "`tinyint_var`",
"nullDirection" : "UNSPECIFIED"
} ],
"reverse" : false,
"initialAllocation" : 20000000,
"maxAllocation" : 10000000000,
"cost" : 50.0
}, {
"pop" : "selection-vector-remover",
"@id" : 4,
"child" : 5,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 50.0
}, {
"pop" : "project",
"@id" : 3,
"exprs" : [ {
"ref" : "`tinyint_var`",
"expr" : "`tinyint_var`"
}, {
"ref" : "`$f1`",
"expr" : "length(`str_var`) "
} ],
"child" : 4,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 50.0
}, {
"pop" : "hash-aggregate",
"@id" : 2,
"child" : 3,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`tinyint_var`",
"expr" : "`tinyint_var`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$0`",
"expr" : "max(`$f1`) "
} ],
"cost" : 25.0
}, {
"pop" : "project",
"@id" : 1,
"ex |
+------------+------------+
1 row selected (0.087 seconds)
{code}
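The sub-query already sorts its output on tinyint_var (ORDER BY), and the outer query groups by that same column. Because the input to the aggregation is already sorted on the grouping key, it makes more sense cost-wise to use a streaming aggregate instead of the hash aggregate the optimizer selected (HashAgg at 00-02 in the plan above).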
I attached the parquet file used. Let me know if you have any questions.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)