You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2015/05/06 09:00:31 UTC
[2/2] hive git commit: HIVE-10607 : Combination of ReducesinkDedup +
TopN optimization yields incorrect result if there are multiple GBY in
reducer (Ashutosh Chauhan via Sergey Shelukhin)
HIVE-10607 : Combination of ReducesinkDedup + TopN optimization yields incorrect result if there are multiple GBY in reducer (Ashutosh Chauhan via Sergey Shelukhin)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/402b8ae8
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/402b8ae8
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/402b8ae8
Branch: refs/heads/branch-1.2
Commit: 402b8ae843a9d00d90ad8c0680976b0e30d6e24b
Parents: 915c639
Author: Ashutosh Chauhan <ha...@apache.org>
Authored: Mon May 4 22:25:12 2015 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Wed May 6 00:00:05 2015 -0700
----------------------------------------------------------------------
.../ql/optimizer/LimitPushdownOptimizer.java | 9 +-
.../queries/clientpositive/limit_pushdown.q | 4 +
.../results/clientpositive/limit_pushdown.q.out | 88 ++++++++++++++++++
.../clientpositive/spark/limit_pushdown.q.out | 94 ++++++++++++++++++++
.../clientpositive/tez/limit_pushdown.q.out | 94 ++++++++++++++++++++
5 files changed, 288 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/402b8ae8/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
index f80941e..e850550 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
@@ -86,6 +87,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
*/
public class LimitPushdownOptimizer implements Transform {
+ @Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1",
@@ -105,6 +107,7 @@ public class LimitPushdownOptimizer implements Transform {
private static class TopNReducer implements NodeProcessor {
+ @Override
public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
ReduceSinkOperator rs = null;
@@ -122,6 +125,10 @@ public class LimitPushdownOptimizer implements Transform {
}
}
if (rs != null) {
+ if (OperatorUtils.findOperators(rs, GroupByOperator.class).size() > 1){
+ // Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more.
+ return false;
+ }
LimitOperator limit = (LimitOperator) nd;
rs.getConf().setTopN(limit.getConf().getLimit());
rs.getConf().setTopNMemoryUsage(((LimitPushdownContext) procCtx).threshold);
@@ -135,7 +142,7 @@ public class LimitPushdownOptimizer implements Transform {
private static class LimitPushdownContext implements NodeProcessorCtx {
- private float threshold;
+ private final float threshold;
public LimitPushdownContext(HiveConf conf) throws SemanticException {
threshold = conf.getFloatVar(HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE);
http://git-wip-us.apache.org/repos/asf/hive/blob/402b8ae8/ql/src/test/queries/clientpositive/limit_pushdown.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/limit_pushdown.q b/ql/src/test/queries/clientpositive/limit_pushdown.q
index d93e246..3940564 100644
--- a/ql/src/test/queries/clientpositive/limit_pushdown.q
+++ b/ql/src/test/queries/clientpositive/limit_pushdown.q
@@ -31,6 +31,10 @@ explain
select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
+explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+
-- multi distinct
explain
select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20;
http://git-wip-us.apache.org/repos/asf/hive/blob/402b8ae8/ql/src/test/results/clientpositive/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/limit_pushdown.q.out b/ql/src/test/results/clientpositive/limit_pushdown.q.out
index c7ab7b3..6ace047 100644
--- a/ql/src/test/results/clientpositive/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/limit_pushdown.q.out
@@ -504,6 +504,94 @@ POSTHOOK: Input: default@alltypesorc
-63 19
-64 24
NULL 2932
+PREHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: alltypesorc
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: ctinyint (type: tinyint), cdouble (type: double)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col0 (type: tinyint), _col1 (type: double)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: tinyint), _col1 (type: double)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: tinyint)
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col1)
+ keys: _col0 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 20
+ Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 20
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46 24
+-47 22
+-48 29
+-49 26
+-50 30
+-51 21
+-52 33
+-53 22
+-54 26
+-55 29
+-56 36
+-57 35
+-58 23
+-59 31
+-60 27
+-61 25
+-62 27
+-63 19
+-64 24
+NULL 2932
PREHOOK: query: -- multi distinct
explain
select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20
http://git-wip-us.apache.org/repos/asf/hive/blob/402b8ae8/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
index 01106a4..40af253 100644
--- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
-63 19
-64 24
NULL 2932
+PREHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: alltypesorc
+ Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: ctinyint (type: tinyint), cdouble (type: double)
+ outputColumnNames: ctinyint, cdouble
+ Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: ctinyint (type: tinyint), cdouble (type: double)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: tinyint), _col1 (type: double)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: tinyint)
+ Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+ Reducer 2
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col1)
+ keys: _col0 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 20
+ Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 20
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46 24
+-47 22
+-48 29
+-49 26
+-50 30
+-51 21
+-52 33
+-53 22
+-54 26
+-55 29
+-56 36
+-57 35
+-58 23
+-59 31
+-60 27
+-61 25
+-62 27
+-63 19
+-64 24
+NULL 2932
PREHOOK: query: -- multi distinct
explain
select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20
http://git-wip-us.apache.org/repos/asf/hive/blob/402b8ae8/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
index 952d7ff..7038b4d 100644
--- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
-63 19
-64 24
NULL 2932
+PREHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: alltypesorc
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: ctinyint (type: tinyint), cdouble (type: double)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col0 (type: tinyint), _col1 (type: double)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: tinyint), _col1 (type: double)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: tinyint)
+ Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+ Reducer 2
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col1)
+ keys: _col0 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 20
+ Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 20
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46 24
+-47 22
+-48 29
+-49 26
+-50 30
+-51 21
+-52 33
+-53 22
+-54 26
+-55 29
+-56 36
+-57 35
+-58 23
+-59 31
+-60 27
+-61 25
+-62 27
+-63 19
+-64 24
+NULL 2932
PREHOOK: query: -- multi distinct
explain
select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20