You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/01/17 02:00:25 UTC

[2/2] impala git commit: IMPALA-6388: Fix the Union node number of hosts estimation

IMPALA-6388: Fix the Union node number of hosts estimation

Before this patch, we would estimate the number of hosts for the union
node by looking only at the first union operand. This is obviously
incorrect and led us to underestimate the value.

We fix the problem by setting the estimate to the maximum of the
estimates of its children.

Testing:
- Added a planner test that reproduces the issue

Change-Id: I51e1ecca8dbc84b2b5a72708667b2799d00279f0
Reviewed-on: http://gerrit.cloudera.org:8080/9017
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/f8b40622
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/f8b40622
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/f8b40622

Branch: refs/heads/master
Commit: f8b406222de8f41765ef1d130e2debbd8ab06369
Parents: 4c43cac
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Thu Jan 11 17:01:07 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 17 01:41:16 2018 +0000

----------------------------------------------------------------------
 .../org/apache/impala/planner/UnionNode.java    |  1 +
 .../queries/PlannerTest/union.test              | 88 ++++++++++++++++++++
 .../workloads/tpch/queries/insert_parquet.test  |  2 +
 .../tpch/queries/tpch-aggregations.test         |  2 +
 4 files changed, 93 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/fe/src/main/java/org/apache/impala/planner/UnionNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
index 302f62d..52ce508 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
@@ -117,6 +117,7 @@ public class UnionNode extends PlanNode {
       if (child.cardinality_ > 0) {
         cardinality_ = checkedAdd(cardinality_, child.cardinality_);
       }
+      numNodes_ = Math.max(child.getNumNodes(), numNodes_);
     }
     // The number of nodes of a union node is -1 (invalid) if all the referenced tables
     // are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN

http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/functional-planner/queries/PlannerTest/union.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
index 4ebfb6b..a0d0c26 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
@@ -3215,3 +3215,91 @@ PLAN-ROOT SINK
 00:SCAN HDFS [tpch_nested_parquet.customer c]
    partitions=1/1 files=4 size=292.36MB
 ====
+# IMPALA-6388: Verify that the order of the union operands does not impact the
+# number of hosts computation
+select f2 from functional.emptytable
+union all
+select int_col from functional.alltypes
+---- QUERYOPTIONS
+explain_level=2
+---- DISTRIBUTEDPLAN
+F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=0B mem-reservation=0B
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+03:EXCHANGE [UNPARTITIONED]
+|  mem-estimate=0B mem-reservation=0B
+|  tuple-ids=2 row-size=4B cardinality=7300
+|
+F02:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=0B
+00:UNION
+|  pass-through-operands: all
+|  mem-estimate=0B mem-reservation=0B
+|  tuple-ids=2 row-size=4B cardinality=7300
+|
+|--02:SCAN HDFS [functional.alltypes, RANDOM]
+|     partitions=24/24 files=24 size=478.45KB
+|     stored statistics:
+|       table: rows=7300 size=478.45KB
+|       partitions: 24/24 rows=7300
+|       columns: all
+|     extrapolated-rows=disabled
+|     mem-estimate=128.00MB mem-reservation=0B
+|     tuple-ids=1 row-size=4B cardinality=7300
+|
+01:SCAN HDFS [functional.emptytable, RANDOM]
+   partitions=0/0 files=0 size=0B
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     partitions: 0/0 rows=unavailable
+     columns: all
+   extrapolated-rows=disabled
+   mem-estimate=0B mem-reservation=0B
+   tuple-ids=0 row-size=4B cardinality=0
+====
+# IMPALA-6388: Verify that the order of the union operands does not impact the
+# number of hosts computation
+select int_col from functional.alltypes
+union all
+select f2 from functional.emptytable
+---- QUERYOPTIONS
+explain_level=2
+---- DISTRIBUTEDPLAN
+F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=0B mem-reservation=0B
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+03:EXCHANGE [UNPARTITIONED]
+|  mem-estimate=0B mem-reservation=0B
+|  tuple-ids=2 row-size=4B cardinality=7300
+|
+F02:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=0B
+00:UNION
+|  pass-through-operands: all
+|  mem-estimate=0B mem-reservation=0B
+|  tuple-ids=2 row-size=4B cardinality=7300
+|
+|--02:SCAN HDFS [functional.emptytable, RANDOM]
+|     partitions=0/0 files=0 size=0B
+|     stored statistics:
+|       table: rows=unavailable size=unavailable
+|       partitions: 0/0 rows=unavailable
+|       columns: all
+|     extrapolated-rows=disabled
+|     mem-estimate=0B mem-reservation=0B
+|     tuple-ids=1 row-size=4B cardinality=0
+|
+01:SCAN HDFS [functional.alltypes, RANDOM]
+   partitions=24/24 files=24 size=478.45KB
+   stored statistics:
+     table: rows=7300 size=478.45KB
+     partitions: 24/24 rows=7300
+     columns: all
+   extrapolated-rows=disabled
+   mem-estimate=128.00MB mem-reservation=0B
+   tuple-ids=0 row-size=4B cardinality=7300
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/tpch/queries/insert_parquet.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/queries/insert_parquet.test b/testdata/workloads/tpch/queries/insert_parquet.test
index 862548e..23c945d 100644
--- a/testdata/workloads/tpch/queries/insert_parquet.test
+++ b/testdata/workloads/tpch/queries/insert_parquet.test
@@ -57,6 +57,8 @@ bigint
 # Test to verify that huge (larger than 64k) values can be written, see IMPALA-1705
 create table if not exists test_insert_huge_vals (s string) stored as parquet
 location '$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/test_insert_huge_vals';
+# Large rows are possible in this query, so we bump up the max row size.
+set max_row_size=1048576;
 insert overwrite table test_insert_huge_vals
   select cast(l_orderkey as string) from tpch.lineitem
   union select group_concat(concat(s_name, s_address, s_phone)) from tpch.supplier

http://git-wip-us.apache.org/repos/asf/impala/blob/f8b40622/testdata/workloads/tpch/queries/tpch-aggregations.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/queries/tpch-aggregations.test b/testdata/workloads/tpch/queries/tpch-aggregations.test
index 462c3b8..58575ad 100644
--- a/testdata/workloads/tpch/queries/tpch-aggregations.test
+++ b/testdata/workloads/tpch/queries/tpch-aggregations.test
@@ -2,6 +2,8 @@
 ---- QUERY
 # Regression test for IMPALA-2352. Small buffers of streams get full and we
 # can switch to IO-buffers without spilling.
+# Large rows are possible in this query, so we bump up the max row size.
+set max_row_size=1048576;
 SELECT count(*) FROM
 (SELECT cast(l_orderkey as string) s FROM tpch.lineitem UNION
  SELECT group_concat(concat(s_name, s_address, s_phone)) FROM tpch.supplier) v