You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/05/24 21:40:57 UTC
incubator-impala git commit: IMPALA-3450: LIMITs on plan nodes are
reflected in cardinality estimates
Repository: incubator-impala
Updated Branches:
refs/heads/master f413e236a -> d70ffa455
IMPALA-3450: LIMITs on plan nodes are reflected in cardinality estimates
PlanNode includes a 'capAtLimit()' method that can be used in
'computeStats()' on PlanNodes to ensure they do not estimate their
cardinality to be more than a pushed-down LIMIT clause.
This patch ensures that 'capAtLimit()' is used in all of the relevant
classes descending from PlanNode.
Change-Id: Ic06dcb93bbb2510c0d40151302bd817ef340b825
Reviewed-on: http://gerrit.cloudera.org:8080/3127
Reviewed-by: Jim Apple <jb...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d70ffa45
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d70ffa45
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d70ffa45
Branch: refs/heads/master
Commit: d70ffa455d48bb5ffb26c9eb065a40374c369d8c
Parents: f413e23
Author: Jim Apple <jb...@cloudera.com>
Authored: Wed May 18 13:20:40 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Tue May 24 14:40:52 2016 -0700
----------------------------------------------------------------------
.../impala/planner/AggregationNode.java | 1 +
.../impala/planner/AnalyticEvalNode.java | 1 +
.../com/cloudera/impala/planner/JoinNode.java | 2 +-
.../com/cloudera/impala/planner/SelectNode.java | 1 +
.../cloudera/impala/planner/SubplanNode.java | 1 +
.../com/cloudera/impala/planner/UnionNode.java | 2 +-
.../com/cloudera/impala/planner/UnnestNode.java | 1 +
.../impala/planner/PlannerTestBase.java | 33 ++++++
.../queries/PlannerTest/aggregation.test | 2 +-
.../queries/PlannerTest/inline-view-limit.test | 13 +++
.../queries/PlannerTest/join-order.test | 32 +++---
.../queries/PlannerTest/joins.test | 111 +++++++++++++++++++
.../queries/PlannerTest/nested-collections.test | 2 +-
.../queries/PlannerTest/union.test | 28 +++++
14 files changed, 210 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java b/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java
index 5600bf7..ae6a967 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/AggregationNode.java
@@ -196,6 +196,7 @@ public class AggregationNode extends PlanNode {
cardinality_ = Math.min(getChild(0).getCardinality(), cardinality_);
}
}
+ cardinality_ = capAtLimit(cardinality_);
LOG.trace("stats Agg: cardinality=" + Long.toString(cardinality_));
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java b/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java
index 3ee6be6..653e0a1 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/AnalyticEvalNode.java
@@ -126,6 +126,7 @@ public class AnalyticEvalNode extends PlanNode {
protected void computeStats(Analyzer analyzer) {
super.computeStats(analyzer);
cardinality_ = getChild(0).cardinality_;
+ cardinality_ = capAtLimit(cardinality_);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java b/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java
index 34de765..6d60e43 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/JoinNode.java
@@ -445,7 +445,7 @@ public abstract class JoinNode extends PlanNode {
break;
}
}
-
+ cardinality_ = capAtLimit(cardinality_);
Preconditions.checkState(hasValidStats());
LOG.debug("stats Join: cardinality=" + Long.toString(cardinality_));
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java b/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java
index e6d7bf1..ae622c3 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/SelectNode.java
@@ -63,6 +63,7 @@ public class SelectNode extends PlanNode {
Math.round(((double) getChild(0).cardinality_) * computeSelectivity());
Preconditions.checkState(cardinality_ >= 0);
}
+ cardinality_ = capAtLimit(cardinality_);
LOG.debug("stats Select: cardinality=" + Long.toString(cardinality_));
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java b/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java
index 969ff27..3b9070e 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/SubplanNode.java
@@ -74,6 +74,7 @@ public class SubplanNode extends PlanNode {
} else {
cardinality_ = -1;
}
+ cardinality_ = capAtLimit(cardinality_);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java b/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java
index 171451f..9c2ebf2 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/UnionNode.java
@@ -98,7 +98,7 @@ public class UnionNode extends PlanNode {
// are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN
// (VALUES(1 x, 1 y)) b ON (a.x = b.y)). We need to set the correct value.
if (numNodes_ == -1) numNodes_ = 1;
-
+ cardinality_ = capAtLimit(cardinality_);
LOG.debug("stats Union: cardinality=" + Long.toString(cardinality_));
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java b/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java
index a817b03..0a0ee1b 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/UnnestNode.java
@@ -67,6 +67,7 @@ public class UnnestNode extends PlanNode {
// The containing SubplanNode has not yet been initialized, so get the number
// of nodes from the SubplanNode's input.
numNodes_ = containingSubplanNode_.getChild(0).getNumNodes();
+ cardinality_ = capAtLimit(cardinality_);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java b/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java
index ce8980e..561032d 100644
--- a/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java
+++ b/fe/src/test/java/com/cloudera/impala/planner/PlannerTestBase.java
@@ -398,6 +398,7 @@ public class PlannerTestBase {
testPlan(testCase, Section.PLAN, queryCtx, errorLog, actualOutput);
checkScanRangeLocations(testCase, singleNodeExecRequest, errorLog, actualOutput);
checkColumnLineage(testCase, singleNodeExecRequest, errorLog, actualOutput);
+ checkLimitCardinality(query, singleNodeExecRequest, errorLog);
// Test distributed plan.
testPlan(testCase, Section.DISTRIBUTEDPLAN, queryCtx, errorLog, actualOutput);
// test parallel plans
@@ -550,6 +551,38 @@ public class PlannerTestBase {
}
}
+ /** Checks that limits are accounted for in the cardinality of plan nodes.
+ */
+ private void checkLimitCardinality(
+ String query, TExecRequest execRequest, StringBuilder errorLog) {
+ if (execRequest == null) return;
+ if (!execRequest.isSetQuery_exec_request()
+ || execRequest.query_exec_request == null) {
+ return;
+ }
+ for (TPlanFragment planFragment : execRequest.query_exec_request.fragments) {
+ if (!planFragment.isSetPlan() || planFragment.plan == null) continue;
+ for (TPlanNode node : planFragment.plan.nodes) {
+ if (!node.isSetLimit() || -1 == node.limit) continue;
+ if (!node.isSetEstimated_stats() || node.estimated_stats == null) continue;
+ if (node.limit < node.estimated_stats.cardinality) {
+ StringBuilder limitCardinalityError = new StringBuilder();
+ limitCardinalityError.append("Query: " + query + "\n");
+ limitCardinalityError.append(
+ "Expected cardinality estimate less than or equal to LIMIT: "
+ + node.limit + "\n");
+ limitCardinalityError.append(
+ "Actual cardinality estimate: "
+ + node.estimated_stats.cardinality + "\n");
+ limitCardinalityError.append(
+ "In node id "
+ + node.node_id + "\n");
+ errorLog.append(limitCardinalityError.toString());
+ }
+ }
+ }
+ }
+
private void checkColumnLineage(TestCase testCase, TExecRequest execRequest,
StringBuilder errorLog, StringBuilder actualOutput) {
String query = testCase.getQuery();
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
index 250f85b..47bfb23 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
@@ -1062,4 +1062,4 @@ limit 10
00:SCAN HDFS [tpch_parquet.lineitem]
partitions=1/1 files=3 size=195.85MB
runtime filters: RF002 -> l_orderkey, RF003 -> l_returnflag
-====
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
index 98bc3a1..8d3a816 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
@@ -613,3 +613,16 @@ where a.id > 10 and b.id > 20
predicates: id != 1, functional.alltypes.id != 2, functional.alltypes.id > 10, functional.alltypes.id > 20
runtime filters: RF000 -> id
====
+# IMPALA-3450: limits on select nodes are reflected in cardinality estimates. The test for
+# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only
+# have explain_level=1
+select * from (select * from functional.alltypes limit 100) v where id < 10 limit 1
+---- PLAN
+01:SELECT
+| predicates: functional.alltypes.id < 10
+| limit: 1
+|
+00:SCAN HDFS [functional.alltypes]
+ partitions=24/24 files=24 size=478.45KB
+ limit: 100
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
index ab2b23c..441afca 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
@@ -1275,12 +1275,6 @@ WHERE `$a$2`.`$c$1` > t4.id
08:NESTED LOOP JOIN [INNER JOIN]
| predicates: sum(t1.int_col) > t4.id
|
-|--00:SCAN HDFS [functional.alltypestiny t4]
-| partitions=4/4 files=4 size=460B
-| runtime filters: RF000 -> t4.bigint_col
-|
-07:NESTED LOOP JOIN [CROSS JOIN]
-|
|--05:AGGREGATE [FINALIZE]
| | output: sum(t1.int_col)
| | limit: 1
@@ -1288,17 +1282,23 @@ WHERE `$a$2`.`$c$1` > t4.id
| 04:SCAN HDFS [functional.alltypesagg t1]
| partitions=11/11 files=11 size=814.73KB
|
-03:HASH JOIN [INNER JOIN]
-| hash predicates: t1.bigint_col = t2.smallint_col
-| runtime filters: RF001 <- t2.smallint_col
-| limit: 1
+07:NESTED LOOP JOIN [CROSS JOIN]
|
-|--02:SCAN HDFS [functional.alltypestiny t2]
-| partitions=4/4 files=4 size=460B
+|--03:HASH JOIN [INNER JOIN]
+| | hash predicates: t1.bigint_col = t2.smallint_col
+| | runtime filters: RF001 <- t2.smallint_col
+| | limit: 1
+| |
+| |--02:SCAN HDFS [functional.alltypestiny t2]
+| | partitions=4/4 files=4 size=460B
+| |
+| 01:SCAN HDFS [functional.alltypes t1]
+| partitions=24/24 files=24 size=478.45KB
+| runtime filters: RF001 -> t1.bigint_col
|
-01:SCAN HDFS [functional.alltypes t1]
- partitions=24/24 files=24 size=478.45KB
- runtime filters: RF001 -> t1.bigint_col
+00:SCAN HDFS [functional.alltypestiny t4]
+ partitions=4/4 files=4 size=460B
+ runtime filters: RF000 -> t4.bigint_col
====
# Tests assignment of conjuncts to inverted outer joins (IMPALA-1342).
select 1
@@ -1498,4 +1498,4 @@ and a.timestamp_col <=> now()
partitions=24/24 files=24 size=478.45KB
predicates: date_string_col IS NOT DISTINCT FROM ''
runtime filters: RF000 -> functional.alltypes.id
-====
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
index 6280bf5..5d0e892 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
@@ -2199,3 +2199,114 @@ and t3.a != t2.g
00:SCAN HDFS [functional.nulltable t1]
partitions=1/1 files=1 size=18B
====
+# IMPALA-3450: limits on join nodes are reflected in cardinality estimates. The test for
+# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only
+# have explain_level=1
+select a.c_custkey as c_custkey from tpch.customer a, tpch.customer b limit 1
+---- PLAN
+02:NESTED LOOP JOIN [CROSS JOIN]
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+====
+select a.c_custkey as c_custkey from tpch.customer a left semi join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [LEFT SEMI JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| runtime filters: RF000 <- b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+ runtime filters: RF000 -> a.c_custkey
+====
+select b.c_custkey as c_custkey from tpch.customer a right semi join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [RIGHT SEMI JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| runtime filters: RF000 <- b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+ runtime filters: RF000 -> a.c_custkey
+====
+select a.c_custkey as c_custkey from tpch.customer a left outer join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+====
+select b.c_custkey as c_custkey from tpch.customer a right outer join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [RIGHT OUTER JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| runtime filters: RF000 <- b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+ runtime filters: RF000 -> a.c_custkey
+====
+select a.c_custkey as c_custkey from tpch.customer a full outer join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [FULL OUTER JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+====
+select a.c_custkey as c_custkey from tpch.customer a left anti join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [LEFT ANTI JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+====
+select b.c_custkey as c_custkey from tpch.customer a right anti join tpch.customer b
+using (c_custkey) limit 1
+---- PLAN
+02:HASH JOIN [RIGHT ANTI JOIN]
+| hash predicates: a.c_custkey = b.c_custkey
+| limit: 1
+|
+|--01:SCAN HDFS [tpch.customer b]
+| partitions=1/1 files=1 size=23.08MB
+|
+00:SCAN HDFS [tpch.customer a]
+ partitions=1/1 files=1 size=23.08MB
+====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
index 4ab84b3..ee0bce4 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
@@ -1772,4 +1772,4 @@ left semi join c2.c_orders o2
|
00:SCAN HDFS [tpch_nested_parquet.customer c1]
partitions=1/1 files=4 size=577.87MB
-====
+====
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d70ffa45/testdata/workloads/functional-planner/queries/PlannerTest/union.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
index d87c351..86c59c7 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
@@ -2661,3 +2661,31 @@ select 1000, 2000
06:SCAN HDFS [functional.alltypes]
partitions=1/24 files=1 size=18.12KB
====
+# IMPALA-3450: limits on union nodes are reflected in cardinality estimates. The test for
+# this is embedded in PlannerTestBase.java and is not visible in these plans, as they only
+# have explain_level=1
+select * from tpch.lineitem UNION ALL (select * from tpch.lineitem) LIMIT 1
+---- PLAN
+00:UNION
+| limit: 1
+|
+|--02:SCAN HDFS [tpch.lineitem]
+| partitions=1/1 files=1 size=718.94MB
+|
+01:SCAN HDFS [tpch.lineitem]
+ partitions=1/1 files=1 size=718.94MB
+====
+select l_orderkey from tpch.lineitem UNION DISTINCT (select l_orderkey from tpch.lineitem) LIMIT 1
+---- PLAN
+03:AGGREGATE [FINALIZE]
+| group by: l_orderkey
+| limit: 1
+|
+00:UNION
+|
+|--02:SCAN HDFS [tpch.lineitem]
+| partitions=1/1 files=1 size=718.94MB
+|
+01:SCAN HDFS [tpch.lineitem]
+ partitions=1/1 files=1 size=718.94MB
+====
\ No newline at end of file