You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pr...@apache.org on 2015/05/07 03:21:35 UTC

[41/52] [abbrv] hive git commit: HIVE-10607 : Combination of ReducesinkDedup + TopN optimization yields incorrect result if there are multiple GBY in reducer (Ashutosh Chauhan via Sergey Shelukhin)

HIVE-10607 : Combination of ReducesinkDedup + TopN optimization yields incorrect result if there are multiple GBY in reducer (Ashutosh Chauhan via Sergey Shelukhin)

Signed-off-by: Ashutosh Chauhan <ha...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/c0116739
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/c0116739
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/c0116739

Branch: refs/heads/llap
Commit: c0116739972bcffcc65498eb721f6b8c1b8e305d
Parents: eefb071
Author: Ashutosh Chauhan <ha...@apache.org>
Authored: Mon May 4 22:25:12 2015 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Tue May 5 23:52:38 2015 -0700

----------------------------------------------------------------------
 .../ql/optimizer/LimitPushdownOptimizer.java    |  9 +-
 .../queries/clientpositive/limit_pushdown.q     |  4 +
 .../results/clientpositive/limit_pushdown.q.out | 88 ++++++++++++++++++
 .../clientpositive/spark/limit_pushdown.q.out   | 94 ++++++++++++++++++++
 .../clientpositive/tez/limit_pushdown.q.out     | 94 ++++++++++++++++++++
 5 files changed, 288 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
index f80941e..e850550 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
 import org.apache.hadoop.hive.ql.exec.LimitOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorUtils;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
 import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
@@ -86,6 +87,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
  */
 public class LimitPushdownOptimizer implements Transform {
 
+  @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
     Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
     opRules.put(new RuleRegExp("R1",
@@ -105,6 +107,7 @@ public class LimitPushdownOptimizer implements Transform {
 
   private static class TopNReducer implements NodeProcessor {
 
+    @Override
     public Object process(Node nd, Stack<Node> stack,
         NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
       ReduceSinkOperator rs = null;
@@ -122,6 +125,10 @@ public class LimitPushdownOptimizer implements Transform {
         }
       }
       if (rs != null) {
+        if (OperatorUtils.findOperators(rs, GroupByOperator.class).size() > 1){
+          // Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more.
+          return false;
+        }
         LimitOperator limit = (LimitOperator) nd;
         rs.getConf().setTopN(limit.getConf().getLimit());
         rs.getConf().setTopNMemoryUsage(((LimitPushdownContext) procCtx).threshold);
@@ -135,7 +142,7 @@ public class LimitPushdownOptimizer implements Transform {
 
   private static class LimitPushdownContext implements NodeProcessorCtx {
 
-    private float threshold;
+    private final float threshold;
 
     public LimitPushdownContext(HiveConf conf) throws SemanticException {
       threshold = conf.getFloatVar(HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE);

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/queries/clientpositive/limit_pushdown.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/limit_pushdown.q b/ql/src/test/queries/clientpositive/limit_pushdown.q
index d93e246..3940564 100644
--- a/ql/src/test/queries/clientpositive/limit_pushdown.q
+++ b/ql/src/test/queries/clientpositive/limit_pushdown.q
@@ -31,6 +31,10 @@ explain
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 
+explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+
 -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20;

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/limit_pushdown.q.out b/ql/src/test/results/clientpositive/limit_pushdown.q.out
index c7ab7b3..6ace047 100644
--- a/ql/src/test/results/clientpositive/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/limit_pushdown.q.out
@@ -504,6 +504,94 @@ POSTHOOK: Input: default@alltypesorc
 -63	19
 -64	24
 NULL	2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: alltypesorc
+            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: ctinyint (type: tinyint), cdouble (type: double)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: _col0 (type: tinyint), _col1 (type: double)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: tinyint), _col1 (type: double)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: tinyint)
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+          Group By Operator
+            aggregations: count(_col1)
+            keys: _col0 (type: tinyint)
+            mode: complete
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+            Limit
+              Number of rows: 20
+              Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46	24
+-47	22
+-48	29
+-49	26
+-50	30
+-51	21
+-52	33
+-53	22
+-54	26
+-55	29
+-56	36
+-57	35
+-58	23
+-59	31
+-60	27
+-61	25
+-62	27
+-63	19
+-64	24
+NULL	2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
index 01106a4..40af253 100644
--- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
 -63	19
 -64	24
 NULL	2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: alltypesorc
+                  Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: ctinyint (type: tinyint), cdouble (type: double)
+                    outputColumnNames: ctinyint, cdouble
+                    Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: ctinyint (type: tinyint), cdouble (type: double)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: tinyint), _col1 (type: double)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: tinyint)
+                        Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46	24
+-47	22
+-48	29
+-49	26
+-50	30
+-51	21
+-52	33
+-53	22
+-54	26
+-55	29
+-56	36
+-57	35
+-58	23
+-59	31
+-60	27
+-61	25
+-62	27
+-63	19
+-64	24
+NULL	2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
index 952d7ff..7038b4d 100644
--- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
 -63	19
 -64	24
 NULL	2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: alltypesorc
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: ctinyint (type: tinyint), cdouble (type: double)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: _col0 (type: tinyint), _col1 (type: double)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: tinyint), _col1 (type: double)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: tinyint)
+                        Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46	24
+-47	22
+-48	29
+-49	26
+-50	30
+-51	21
+-52	33
+-53	22
+-54	26
+-55	29
+-56	36
+-57	35
+-58	23
+-59	31
+-60	27
+-61	25
+-62	27
+-63	19
+-64	24
+NULL	2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20