You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@drill.apache.org by "Rahul Challapalli (JIRA)" <ji...@apache.org> on 2014/12/13 01:02:14 UTC

[jira] [Created] (DRILL-1861) Optimizer selects hash aggregate even when we have a sorted dataset

Rahul Challapalli created DRILL-1861:
----------------------------------------

             Summary: Optimizer selects hash aggregate even when we have a sorted dataset
                 Key: DRILL-1861
                 URL: https://issues.apache.org/jira/browse/DRILL-1861
             Project: Apache Drill
          Issue Type: Bug
          Components: Execution - Operators, Query Planning & Optimization
            Reporter: Rahul Challapalli


git.commit.id.abbrev=142e577

Query :
{code}
select max(length(sub.str_var)) from ( select str_var, tinyint_var from `wide-strings` order by tinyint_var) sub group by sub.tinyint_var;
{code}

Plan :
{code}
+------------+------------+
|    text    |    json    |
+------------+------------+
| 00-00    Screen
00-01      Project(EXPR$0=[$1])
00-02        HashAgg(group=[{0}], EXPR$0=[MAX($1)])
00-03          Project(tinyint_var=[$1], $f1=[LENGTH($0)])
00-04            SelectionVectorRemover
00-05              Sort(sort0=[$1], dir0=[ASC])
00-06                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings]], selectionRoot=/drill/testdata/data-shapes/wide-columns/flat/wide-strings, numFiles=1, columns=[`str_var`, `tinyint_var`]]])
 | {
  "head" : {
    "version" : 1,
    "generator" : {
      "type" : "ExplainHandler",
      "info" : ""
    },
    "type" : "APACHE_DRILL_PHYSICAL",
    "options" : [ ],
    "queue" : 0,
    "resultMode" : "EXEC"
  },
  "graph" : [ {
    "pop" : "parquet-scan",
    "@id" : 6,
    "entries" : [ {
      "path" : "maprfs:/drill/testdata/data-shapes/wide-columns/flat/wide-strings"
    } ],
    "storage" : {
      "type" : "file",
      "enabled" : true,
      "connection" : "maprfs:///",
      "workspaces" : {
        "root" : {
          "location" : "/",
          "writable" : false,
          "defaultInputFormat" : null
        },
        "tmp" : {
          "location" : "/tmp",
          "writable" : true,
          "defaultInputFormat" : "csv"
        },
        "drillTestDir" : {
          "location" : "/drill/testdata/",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirAmplab" : {
          "location" : "/drill/testdata/amplab",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirInformationSchema" : {
          "location" : "/drill/testdata/information-schema",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirUdfs" : {
          "location" : "/drill/testdata/udfs/",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirP1" : {
          "location" : "/drill/testdata/p1tests",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirTpch10Parquet" : {
          "location" : "/drill/testdata/tpch10",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "Join" : {
          "location" : "/drill/testdata/join",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "NoExtJson" : {
          "location" : "/drill/testdata/no-extension/json",
          "writable" : true,
          "defaultInputFormat" : "json"
        },
        "NoExtParquet" : {
          "location" : "/drill/testdata/no-extension/parquet",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "NoExtParquetNull" : {
          "location" : "/drill/testdata/no-extension/parquet",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "NoExtText" : {
          "location" : "/drill/testdata/no-extension/text",
          "writable" : true,
          "defaultInputFormat" : "psv"
        },
        "drillTestDirExchanges" : {
          "location" : "/drill/testdata/exchanges_test",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "TpcHMulti" : {
          "location" : "/drill/testdata/tpch-multi",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "TpcHMulti100" : {
          "location" : "/drill/testdata/SF100",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "TpcHMulti1" : {
          "location" : "/drill/testdata/tpch_SF1",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirExplicit" : {
          "location" : "/drill/testdata/explicit_cast",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirImplicit" : {
          "location" : "/drill/testdata/implicit_cast",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirImplicit1" : {
          "location" : "/drill/testdata/implicit_cast",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirTPCDS" : {
          "location" : "/user/root/tpcds/parquet",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "TPCDS" : {
          "location" : "/drill/testdata/tpcds",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillMondrian" : {
          "location" : "/user/root/mondrian",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirDatetime" : {
          "location" : "/drill/testdata/datetime/datasources",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirViews" : {
          "location" : "/drill/testdata/views/",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirNumerical" : {
          "location" : "/drill/testdata/numerical/",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "drillTestDirJson" : {
          "location" : "/drill/testdata/json_storage/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTestNewWS" : {
          "location" : "/drill/testdata/newWS/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpch01Text" : {
          "location" : "/drill/testdata/Tpch0.01/text/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpch01Json" : {
          "location" : "/drill/testdata/Tpch0.01/json/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpch01Parquet" : {
          "location" : "/drill/testdata/Tpch0.01/parquet/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirConvert" : {
          "location" : "/drill/testdata/convert",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpch100Text" : {
          "location" : "/drill/testdata/tpch100/text/",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpch100Parquet" : {
          "location" : "/drill/testdata/tpch100/parquet",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirAggregate1parquet" : {
          "location" : "/drill/testdata/tpcds/parquet/s1",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirAggregate1csv" : {
          "location" : "/drill/testdata/tpcds/csv/s1",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirAggregate1json" : {
          "location" : "/drill/testdata/tpcds/json/s1",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirMondrian" : {
          "location" : "/drill/testdata/mondrian",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "drillTestDirTpcdsImpalaSF1" : {
          "location" : "/drill/testdata/tpcds-impala-sf1",
          "writable" : true,
          "defaultInputFormat" : null
        },
        "sandbox" : {
          "location" : "/sandbox",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "sandbox-logs" : {
          "location" : "/sandbox/flat",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        },
        "sandbox-json" : {
          "location" : "/sandbox/json",
          "writable" : true,
          "defaultInputFormat" : "parquet"
        }
      },
      "formats" : {
        "psv" : {
          "type" : "text",
          "extensions" : [ "tbl" ],
          "delimiter" : "|"
        },
        "dsv" : {
          "type" : "text",
          "extensions" : [ "dat" ],
          "delimiter" : "|"
        },
        "csv" : {
          "type" : "text",
          "extensions" : [ "csv" ],
          "delimiter" : ","
        },
        "tsv" : {
          "type" : "text",
          "extensions" : [ "tsv" ],
          "delimiter" : "\t"
        },
        "parquet" : {
          "type" : "parquet"
        },
        "json" : {
          "type" : "json"
        }
      }
    },
    "format" : {
      "type" : "parquet"
    },
    "columns" : [ "`str_var`", "`tinyint_var`" ],
    "selectionRoot" : "/drill/testdata/data-shapes/wide-columns/flat/wide-strings",
    "cost" : 50.0
  }, {
    "pop" : "external-sort",
    "@id" : 5,
    "child" : 6,
    "orderings" : [ {
      "order" : "ASC",
      "expr" : "`tinyint_var`",
      "nullDirection" : "UNSPECIFIED"
    } ],
    "reverse" : false,
    "initialAllocation" : 20000000,
    "maxAllocation" : 10000000000,
    "cost" : 50.0
  }, {
    "pop" : "selection-vector-remover",
    "@id" : 4,
    "child" : 5,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 50.0
  }, {
    "pop" : "project",
    "@id" : 3,
    "exprs" : [ {
      "ref" : "`tinyint_var`",
      "expr" : "`tinyint_var`"
    }, {
      "ref" : "`$f1`",
      "expr" : "length(`str_var`) "
    } ],
    "child" : 4,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 50.0
  }, {
    "pop" : "hash-aggregate",
    "@id" : 2,
    "child" : 3,
    "cardinality" : 1.0,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "groupByExprs" : [ {
      "ref" : "`tinyint_var`",
      "expr" : "`tinyint_var`"
    } ],
    "aggrExprs" : [ {
      "ref" : "`EXPR$0`",
      "expr" : "max(`$f1`) "
    } ],
    "cost" : 25.0
  }, {
    "pop" : "project",
    "@id" : 1,
    "ex |
+------------+------------+
1 row selected (0.087 seconds)
{code}

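The sub-query already sorts its output on tinyint_var (ORDER BY), and the outer query groups by that same column, so the aggregate's input arrives sorted on the grouping key. In this case it makes more sense cost-wise for the planner to choose a streaming aggregate, which can consume the sorted input directly, instead of the hash aggregate (HashAgg) shown in the plan above.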

I attached the parquet file used. Let me know if you have any questions.




--
This message was sent by Atlassian JIRA
(v6.3.4#6332)