You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jd...@apache.org on 2017/04/24 17:50:22 UTC

hive git commit: HIVE-16441: De-duplicate semijoin branches in n-way joins (Deepak Jaiswal, reviewed by Jason Dere)

Repository: hive
Updated Branches:
  refs/heads/master 26dd70e20 -> abf72b600


HIVE-16441: De-duplicate semijoin branches in n-way joins (Deepak Jaiswal, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/abf72b60
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/abf72b60
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/abf72b60

Branch: refs/heads/master
Commit: abf72b6009b2d6b9b141ea58e3bf65cc11ca0ad1
Parents: 26dd70e
Author: Jason Dere <jd...@hortonworks.com>
Authored: Mon Apr 24 10:49:53 2017 -0700
Committer: Jason Dere <jd...@hortonworks.com>
Committed: Mon Apr 24 10:49:53 2017 -0700

----------------------------------------------------------------------
 .../DynamicPartitionPruningOptimization.java    | 11 +++++++
 .../hadoop/hive/ql/parse/ParseContext.java      | 10 ++++++
 .../hadoop/hive/ql/parse/TaskCompiler.java      |  1 +
 .../hadoop/hive/ql/parse/TezCompiler.java       | 34 +++++++++++---------
 .../llap/dynamic_semijoin_reduction.q.out       | 34 ++++----------------
 .../clientpositive/llap/semijoin_hint.q.out     | 30 +++--------------
 6 files changed, 50 insertions(+), 70 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index eb3eba5..e1a6952 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -459,6 +459,16 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
       return false;
     }
 
+    // Check if there already exists a semijoin branch
+    GroupByOperator gb = parseContext.getColExprToGBMap().get(key);
+    if (gb != null) {
+      // Already an existing semijoin branch, reuse it
+      createFinalRsForSemiJoinOp(parseContext, ts, gb, key, keyBaseAlias,
+              ctx.parent.getChildren().get(0), sjHint != null);
+      // done!
+      return true;
+    }
+
     List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
     keyExprs.add(key);
 
@@ -726,6 +736,7 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
     runtimeValuesInfo.setColExprs(rsValueCols);
     runtimeValuesInfo.setTsColExpr(colExpr);
     parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
+    parseContext.getColExprToGBMap().put(key, gb);
   }
 
   private Map<Node, Object> collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx)

http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
index 9a69f90..3a1f821 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
@@ -120,6 +120,8 @@ public class ParseContext {
           new HashMap<ReduceSinkOperator, RuntimeValuesInfo>();
   private Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
           new HashMap<>();
+  private Map<ExprNodeDesc, GroupByOperator> colExprToGBMap =
+          new HashMap<>();
 
   public ParseContext() {
   }
@@ -662,4 +664,12 @@ public class ParseContext {
   public Map<ReduceSinkOperator, SemiJoinBranchInfo> getRsToSemiJoinBranchInfo() {
     return rsToSemiJoinBranchInfo;
   }
+
+  public void setColExprToGBMap(Map<ExprNodeDesc, GroupByOperator> colExprToGBMap) {
+    this.colExprToGBMap = colExprToGBMap;
+  }
+
+  public Map<ExprNodeDesc, GroupByOperator> getColExprToGBMap() {
+    return colExprToGBMap;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
index 96525b4..5ea7800 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
@@ -532,6 +532,7 @@ public abstract class TaskCompiler {
     clone.setMapJoinOps(pCtx.getMapJoinOps());
     clone.setRsToRuntimeValuesInfoMap(pCtx.getRsToRuntimeValuesInfoMap());
     clone.setRsToSemiJoinBranchInfo(pCtx.getRsToSemiJoinBranchInfo());
+    clone.setColExprToGBMap(pCtx.getColExprToGBMap());
 
     return clone;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index f87ca28..078704e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -829,8 +829,6 @@ public class TezCompiler extends TaskCompiler {
       GroupByOperator gbOp = (GroupByOperator) (stack.get(stack.size() - 2));
       GroupByDesc gbDesc = gbOp.getConf();
       ArrayList<AggregationDesc> aggregationDescs = gbDesc.getAggregators();
-      boolean removeSemiJoin = false;
-      TableScanOperator ts = sjInfo.getTsOp();
       for (AggregationDesc agg : aggregationDescs) {
         if (agg.getGenericUDAFName() != "bloom_filter") {
           continue;
@@ -844,36 +842,40 @@ public class TezCompiler extends TaskCompiler {
         long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
         if (expectedEntries == -1 || expectedEntries >
                 pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
-          removeSemiJoin = true;
-          if (LOG.isDebugEnabled()) {
-            LOG.debug("expectedEntries=" + expectedEntries + ". "
-                    + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
-                    + "Removing semijoin "
-                    + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+          // Remove the semijoin optimization branch along with ALL the mappings
+          // The parent GB2 has all the branches. Collect them and remove them.
+          for (Operator<?> op : gbOp.getChildOperators()) {
+            ReduceSinkOperator rsFinal = (ReduceSinkOperator) op;
+            TableScanOperator ts = pCtx.getRsToSemiJoinBranchInfo().
+                    get(rsFinal).getTsOp();
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("expectedEntries=" + expectedEntries + ". "
+                      + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
+                      + "Removing semijoin "
+                      + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+            }
+            GenTezUtils.removeBranch(rsFinal);
+            GenTezUtils.removeSemiJoinOperator(pCtx, rsFinal, ts);
           }
-          break;
+          return null;
         }
       }
 
       // At this point, hinted semijoin case has been handled already
       // Check if big table is big enough that runtime filtering is
       // worth it.
+      TableScanOperator ts = sjInfo.getTsOp();
       if (ts.getStatistics() != null) {
         long numRows = ts.getStatistics().getNumRows();
         if (numRows < pCtx.getConf().getLongVar(ConfVars.TEZ_BIGTABLE_MIN_SIZE_SEMIJOIN_REDUCTION)) {
-          removeSemiJoin = true;
           if (LOG.isDebugEnabled()) {
             LOG.debug("Insufficient rows (" + numRows + ") to justify semijoin optimization. Removing semijoin "
                     + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
           }
+          GenTezUtils.removeBranch(rs);
+          GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
         }
       }
-      if (removeSemiJoin) {
-        // The stats are not annotated, remove the semijoin operator
-        GenTezUtils.removeBranch(rs);
-        GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
-      }
-
       return null;
     }
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
index a47ce6e..1d1f86b 100644
--- a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
+++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
@@ -692,13 +692,12 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Map 1 <- Reducer 5 (BROADCAST_EDGE), Reducer 8 (BROADCAST_EDGE)
-        Map 7 <- Reducer 6 (BROADCAST_EDGE)
-        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+        Map 1 <- Reducer 5 (BROADCAST_EDGE), Reducer 7 (BROADCAST_EDGE)
+        Map 6 <- Reducer 5 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
         Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE)
-        Reducer 6 <- Map 4 (CUSTOM_SIMPLE_EDGE)
-        Reducer 8 <- Map 7 (CUSTOM_SIMPLE_EDGE)
+        Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -752,22 +751,9 @@ STAGE PLANS:
                             sort order: 
                             Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
                             value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
-                      Select Operator
-                        expressions: _col0 (type: string)
-                        outputColumnNames: _col0
-                        Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: PARTIAL
-                        Group By Operator
-                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=32)
-                          mode: hash
-                          outputColumnNames: _col0, _col1, _col2
-                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
-                          Reduce Output Operator
-                            sort order: 
-                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
-                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
             Execution mode: llap
             LLAP IO: all inputs
-        Map 7 
+        Map 6 
             Map Operator Tree:
                 TableScan
                   alias: srcpart_date
@@ -848,19 +834,11 @@ STAGE PLANS:
                   sort order: 
                   Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
                   value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
-        Reducer 6 
-            Execution mode: llap
-            Reduce Operator Tree:
-              Group By Operator
-                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=32)
-                mode: final
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
                 Reduce Output Operator
                   sort order: 
                   Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
                   value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
-        Reducer 8 
+        Reducer 7 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator

http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
index bac9240..bc24893 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
@@ -365,12 +365,11 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
         Map 6 <- Reducer 4 (BROADCAST_EDGE)
-        Map 7 <- Reducer 5 (BROADCAST_EDGE)
-        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
         Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
         Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
-        Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -404,22 +403,9 @@ STAGE PLANS:
                             sort order: 
                             Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
                             value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
-                      Select Operator
-                        expressions: cstring (type: string)
-                        outputColumnNames: _col0
-                        Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
-                        Group By Operator
-                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
-                          mode: hash
-                          outputColumnNames: _col0, _col1, _col2
-                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
-                          Reduce Output Operator
-                            sort order: 
-                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
-                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
             Execution mode: llap
             LLAP IO: all inputs
-        Map 6 
+        Map 5 
             Map Operator Tree:
                 TableScan
                   alias: v
@@ -439,7 +425,7 @@ STAGE PLANS:
                         Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
             Execution mode: llap
             LLAP IO: all inputs
-        Map 7 
+        Map 6 
             Map Operator Tree:
                 TableScan
                   alias: k
@@ -509,14 +495,6 @@ STAGE PLANS:
                   sort order: 
                   Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
                   value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
-        Reducer 5 
-            Execution mode: llap
-            Reduce Operator Tree:
-              Group By Operator
-                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
-                mode: final
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
                 Reduce Output Operator
                   sort order: 
                   Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE