You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2015/11/30 20:15:29 UTC
[18/27] hive git commit: HIVE-12503 : GBY-Join transpose rule may go in infinite loop (Ashutosh Chauhan via Jesus Camacho Rodriguez)

HIVE-12503 : GBY-Join transpose rule may go in infinite loop (Ashutosh Chauhan via Jesus Camacho Rodriguez)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a8e61c27
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a8e61c27
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a8e61c27

Branch: refs/heads/master-fixed
Commit: a8e61c27b63e7cacbd817e848a06a17ee2208132
Parents: 18ca715
Author: Ashutosh Chauhan <ha...@apache.org>
Authored: Thu Nov 26 11:39:50 2015 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Nov 30 11:14:37 2015 -0800

----------------------------------------------------------------------
 .../rules/HiveAggregateJoinTransposeRule.java   |  17 ++-
 .../queries/clientpositive/cbo_rp_auto_join1.q  |   2 +-
 .../clientpositive/cbo_rp_auto_join1.q.out      | 125 +++++++++++++-----
 .../clientpositive/groupby_join_pushdown.q.out  | 128 +++++--------------
 4 files changed, 138 insertions(+), 134 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/a8e61c27/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateJoinTransposeRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateJoinTransposeRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateJoinTransposeRule.java
index c59af39..8cbaed0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateJoinTransposeRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateJoinTransposeRule.java
@@ -17,6 +17,7 @@
 package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
 
 import org.apache.calcite.linq4j.Ord;
+import org.apache.calcite.plan.RelOptCost;
 import org.apache.calcite.plan.RelOptRuleCall;
 import org.apache.calcite.plan.RelOptUtil;
 import org.apache.calcite.rel.RelNode;
@@ -295,15 +296,13 @@ public class HiveAggregateJoinTransposeRule extends AggregateJoinTransposeRule {
           Mappings.apply(mapping, aggregate.getGroupSet()),
           Mappings.apply2(mapping, aggregate.getGroupSets()), newAggCalls);
     }
-    call.transformTo(r);
-    // Add original tree as well for potential alternative transformation.
-    // This is modeled after LoptOptimizeJoinRule::findBestOrderings() in
-    // which rule adds multiple transformations and Planner picks the cheapest one.
-    // Hep planner will automatically pick the one with lower cost among two.
-    // For details, see: HepPlanner:applyTransformationResults()
-    // In this case, if ndv is close to # of rows, i.e., group by is not resulting
-    // in any deduction, doing this transformation is not useful.
-    call.transformTo(aggregate);
+
+    // Make a cost based decision to pick cheaper plan
+    RelOptCost afterCost = RelMetadataQuery.getCumulativeCost(r);
+    RelOptCost beforeCost = RelMetadataQuery.getCumulativeCost(aggregate);
+    if (afterCost.isLt(beforeCost)) {
+      call.transformTo(r);
+    }
   }
 
   /** Computes the closure of a set of columns according to a given list of

http://git-wip-us.apache.org/repos/asf/hive/blob/a8e61c27/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q b/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
index b906db2..cbfb5d5 100644
--- a/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
+++ b/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
@@ -3,7 +3,7 @@ set hive.stats.fetch.column.stats=true;
 ;
 
 set hive.exec.reducers.max = 1;
-
+set hive.transpose.aggr.join=true;
 -- SORT_QUERY_RESULTS
 
 CREATE TABLE tbl1(key int, value string) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;

http://git-wip-us.apache.org/repos/asf/hive/blob/a8e61c27/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
index 6537a8a..59a2f12 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join1.q.out
@@ -933,8 +933,10 @@ select count(*) from
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
-  Stage-2 depends on stages: Stage-1
-  Stage-0 depends on stages: Stage-2
+  Stage-2 depends on stages: Stage-1, Stage-4
+  Stage-3 depends on stages: Stage-2
+  Stage-4 is a root stage
+  Stage-0 depends on stages: Stage-3
 
 STAGE PLANS:
   Stage: Stage-1
@@ -947,41 +949,67 @@ STAGE PLANS:
               predicate: (key + 1) is not null (type: boolean)
               Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
-                expressions: key (type: int)
+                expressions: (key + 1) (type: int)
                 outputColumnNames: key
                 Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
-                Reduce Output Operator
-                  key expressions: (key + 1) (type: int)
-                  sort order: +
-                  Map-reduce partition columns: (key + 1) (type: int)
-                  Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  keys: key (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int)
+                    sort order: +
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int)
+          mode: mergepartial
+          outputColumnNames: key, $f1
+          Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
           TableScan
-            alias: subq2:a
-            Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
-            Filter Operator
-              predicate: (key + 1) is not null (type: boolean)
-              Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
-              Select Operator
-                expressions: key (type: int)
-                outputColumnNames: key
-                Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
-                Reduce Output Operator
-                  key expressions: (key + 1) (type: int)
-                  sort order: +
-                  Map-reduce partition columns: (key + 1) (type: int)
-                  Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+            Reduce Output Operator
+              key expressions: key (type: int)
+              sort order: +
+              Map-reduce partition columns: key (type: int)
+              Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+              value expressions: $f1 (type: bigint)
+          TableScan
+            Reduce Output Operator
+              key expressions: key (type: int)
+              sort order: +
+              Map-reduce partition columns: key (type: int)
+              Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+              value expressions: $f1 (type: bigint)
       Reduce Operator Tree:
         Join Operator
           condition map:
                Inner Join 0 to 1
           keys:
-            0 (key + 1) (type: int)
-            1 (key + 1) (type: int)
-          Statistics: Num rows: 5 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+            0 key (type: int)
+            1 key (type: int)
+          outputColumnNames: $f1, $f10
+          Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
           Select Operator
-            Statistics: Num rows: 5 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+            expressions: ($f1 * $f10) (type: bigint)
+            outputColumnNames: $f4
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
             Group By Operator
-              aggregations: count()
+              aggregations: $sum0($f4)
               mode: hash
               outputColumnNames: _col0
               Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
@@ -992,7 +1020,7 @@ STAGE PLANS:
                     output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                     serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
 
-  Stage: Stage-2
+  Stage: Stage-3
     Map Reduce
       Map Operator Tree:
           TableScan
@@ -1002,7 +1030,7 @@ STAGE PLANS:
               value expressions: _col0 (type: bigint)
       Reduce Operator Tree:
         Group By Operator
-          aggregations: count(VALUE._col0)
+          aggregations: $sum0(VALUE._col0)
           mode: mergepartial
           outputColumnNames: $f0
           Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
@@ -1014,6 +1042,45 @@ STAGE PLANS:
                 output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
+  Stage: Stage-4
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: subq2:a
+            Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: (key + 1) is not null (type: boolean)
+              Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                expressions: (key + 1) (type: int)
+                outputColumnNames: key
+                Statistics: Num rows: 5 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  keys: key (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int)
+                    sort order: +
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col1 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int)
+          mode: mergepartial
+          outputColumnNames: key, $f1
+          Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
   Stage: Stage-0
     Fetch Operator
       limit: -1

http://git-wip-us.apache.org/repos/asf/hive/blob/a8e61c27/ql/src/test/results/clientpositive/groupby_join_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/groupby_join_pushdown.q.out b/ql/src/test/results/clientpositive/groupby_join_pushdown.q.out
index 17df98f..c18e62f 100644
--- a/ql/src/test/results/clientpositive/groupby_join_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/groupby_join_pushdown.q.out
@@ -540,10 +540,8 @@ GROUP BY f.ctinyint, g.ctinyint
 POSTHOOK: type: QUERY
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
-  Stage-2 depends on stages: Stage-1, Stage-4
-  Stage-3 depends on stages: Stage-2
-  Stage-4 is a root stage
-  Stage-0 depends on stages: Stage-3
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
 
 STAGE PLANS:
   Stage: Stage-1
@@ -559,49 +557,28 @@ STAGE PLANS:
                 expressions: ctinyint (type: tinyint), cint (type: int), cbigint (type: bigint)
                 outputColumnNames: _col0, _col1, _col2
                 Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                Group By Operator
-                  aggregations: sum(_col2)
-                  keys: _col0 (type: tinyint), _col1 (type: int)
-                  mode: hash
-                  outputColumnNames: _col0, _col1, _col2
+                Reduce Output Operator
+                  key expressions: _col1 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col1 (type: int)
                   Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                  Reduce Output Operator
-                    key expressions: _col0 (type: tinyint), _col1 (type: int)
-                    sort order: ++
-                    Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: int)
-                    Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                    value expressions: _col2 (type: bigint)
-      Reduce Operator Tree:
-        Group By Operator
-          aggregations: sum(VALUE._col0)
-          keys: KEY._col0 (type: tinyint), KEY._col1 (type: int)
-          mode: mergepartial
-          outputColumnNames: _col0, _col1, _col2
-          Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
-          File Output Operator
-            compressed: false
-            table:
-                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
-                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
-
-  Stage: Stage-2
-    Map Reduce
-      Map Operator Tree:
-          TableScan
-            Reduce Output Operator
-              key expressions: _col1 (type: int)
-              sort order: +
-              Map-reduce partition columns: _col1 (type: int)
-              Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
-              value expressions: _col0 (type: tinyint), _col2 (type: bigint)
+                  value expressions: _col0 (type: tinyint), _col2 (type: bigint)
           TableScan
-            Reduce Output Operator
-              key expressions: _col1 (type: int)
-              sort order: +
-              Map-reduce partition columns: _col1 (type: int)
-              Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
-              value expressions: _col0 (type: tinyint), _col2 (type: bigint)
+            alias: f
+            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: cint is not null (type: boolean)
+              Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: ctinyint (type: tinyint), cint (type: int)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col1 (type: int)
+                  sort order: +
+                  Map-reduce partition columns: _col1 (type: int)
+                  Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: tinyint)
       Reduce Operator Tree:
         Join Operator
           condition map:
@@ -609,18 +586,18 @@ STAGE PLANS:
           keys:
             0 _col1 (type: int)
             1 _col1 (type: int)
-          outputColumnNames: _col0, _col2, _col3, _col5
-          Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
+          outputColumnNames: _col0, _col2, _col3
+          Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE
           Select Operator
-            expressions: _col0 (type: tinyint), _col3 (type: tinyint), (_col2 * _col5) (type: bigint)
-            outputColumnNames: _col0, _col3, _col6
-            Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
+            expressions: _col0 (type: tinyint), _col3 (type: tinyint), _col2 (type: bigint)
+            outputColumnNames: _col0, _col3, _col2
+            Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE
             Group By Operator
-              aggregations: sum(_col6)
+              aggregations: sum(_col2)
               keys: _col0 (type: tinyint), _col3 (type: tinyint)
               mode: hash
               outputColumnNames: _col0, _col1, _col2
-              Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE
               File Output Operator
                 compressed: false
                 table:
@@ -628,7 +605,7 @@ STAGE PLANS:
                     output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                     serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
 
-  Stage: Stage-3
+  Stage: Stage-2
     Map Reduce
       Map Operator Tree:
           TableScan
@@ -636,7 +613,7 @@ STAGE PLANS:
               key expressions: _col0 (type: tinyint), _col1 (type: tinyint)
               sort order: ++
               Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: tinyint)
-              Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 6758 Data size: 1453080 Basic stats: COMPLETE Column stats: NONE
               value expressions: _col2 (type: bigint)
       Reduce Operator Tree:
         Group By Operator
@@ -644,54 +621,15 @@ STAGE PLANS:
           keys: KEY._col0 (type: tinyint), KEY._col1 (type: tinyint)
           mode: mergepartial
           outputColumnNames: _col0, _col1, _col2
-          Statistics: Num rows: 1689 Data size: 363162 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
           File Output Operator
             compressed: false
-            Statistics: Num rows: 1689 Data size: 363162 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 3379 Data size: 726540 Basic stats: COMPLETE Column stats: NONE
             table:
                 input format: org.apache.hadoop.mapred.TextInputFormat
                 output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
-  Stage: Stage-4
-    Map Reduce
-      Map Operator Tree:
-          TableScan
-            alias: f
-            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
-            Filter Operator
-              predicate: cint is not null (type: boolean)
-              Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-              Select Operator
-                expressions: ctinyint (type: tinyint), cint (type: int)
-                outputColumnNames: _col0, _col1
-                Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                Group By Operator
-                  aggregations: count()
-                  keys: _col0 (type: tinyint), _col1 (type: int)
-                  mode: hash
-                  outputColumnNames: _col0, _col1, _col2
-                  Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                  Reduce Output Operator
-                    key expressions: _col0 (type: tinyint), _col1 (type: int)
-                    sort order: ++
-                    Map-reduce partition columns: _col0 (type: tinyint), _col1 (type: int)
-                    Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
-                    value expressions: _col2 (type: bigint)
-      Reduce Operator Tree:
-        Group By Operator
-          aggregations: count(VALUE._col0)
-          keys: KEY._col0 (type: tinyint), KEY._col1 (type: int)
-          mode: mergepartial
-          outputColumnNames: _col0, _col1, _col2
-          Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
-          File Output Operator
-            compressed: false
-            table:
-                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
-                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
-
   Stage: Stage-0
     Fetch Operator
       limit: -1