You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jd...@apache.org on 2017/04/24 17:50:22 UTC
hive git commit: HIVE-16441: De-duplicate semijoin branches in n-way joins (Deepak Jaiswal, reviewed by Jason Dere)
Repository: hive
Updated Branches:
refs/heads/master 26dd70e20 -> abf72b600
HIVE-16441: De-duplicate semijoin branches in n-way joins (Deepak Jaiswal, reviewed by Jason Dere)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/abf72b60
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/abf72b60
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/abf72b60
Branch: refs/heads/master
Commit: abf72b6009b2d6b9b141ea58e3bf65cc11ca0ad1
Parents: 26dd70e
Author: Jason Dere <jd...@hortonworks.com>
Authored: Mon Apr 24 10:49:53 2017 -0700
Committer: Jason Dere <jd...@hortonworks.com>
Committed: Mon Apr 24 10:49:53 2017 -0700
----------------------------------------------------------------------
.../DynamicPartitionPruningOptimization.java | 11 +++++++
.../hadoop/hive/ql/parse/ParseContext.java | 10 ++++++
.../hadoop/hive/ql/parse/TaskCompiler.java | 1 +
.../hadoop/hive/ql/parse/TezCompiler.java | 34 +++++++++++---------
.../llap/dynamic_semijoin_reduction.q.out | 34 ++++----------------
.../clientpositive/llap/semijoin_hint.q.out | 30 +++--------------
6 files changed, 50 insertions(+), 70 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index eb3eba5..e1a6952 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -459,6 +459,16 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
return false;
}
+ // Check if there already exists a semijoin branch
+ GroupByOperator gb = parseContext.getColExprToGBMap().get(key);
+ if (gb != null) {
+ // Already an existing semijoin branch, reuse it
+ createFinalRsForSemiJoinOp(parseContext, ts, gb, key, keyBaseAlias,
+ ctx.parent.getChildren().get(0), sjHint != null);
+ // done!
+ return true;
+ }
+
List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
keyExprs.add(key);
@@ -726,6 +736,7 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
runtimeValuesInfo.setColExprs(rsValueCols);
runtimeValuesInfo.setTsColExpr(colExpr);
parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
+ parseContext.getColExprToGBMap().put(key, gb);
}
private Map<Node, Object> collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx)
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
index 9a69f90..3a1f821 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
@@ -120,6 +120,8 @@ public class ParseContext {
new HashMap<ReduceSinkOperator, RuntimeValuesInfo>();
private Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
new HashMap<>();
+ private Map<ExprNodeDesc, GroupByOperator> colExprToGBMap =
+ new HashMap<>();
public ParseContext() {
}
@@ -662,4 +664,12 @@ public class ParseContext {
public Map<ReduceSinkOperator, SemiJoinBranchInfo> getRsToSemiJoinBranchInfo() {
return rsToSemiJoinBranchInfo;
}
+
+ public void setColExprToGBMap(Map<ExprNodeDesc, GroupByOperator> colExprToGBMap) {
+ this.colExprToGBMap = colExprToGBMap;
+ }
+
+ public Map<ExprNodeDesc, GroupByOperator> getColExprToGBMap() {
+ return colExprToGBMap;
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
index 96525b4..5ea7800 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
@@ -532,6 +532,7 @@ public abstract class TaskCompiler {
clone.setMapJoinOps(pCtx.getMapJoinOps());
clone.setRsToRuntimeValuesInfoMap(pCtx.getRsToRuntimeValuesInfoMap());
clone.setRsToSemiJoinBranchInfo(pCtx.getRsToSemiJoinBranchInfo());
+ clone.setColExprToGBMap(pCtx.getColExprToGBMap());
return clone;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index f87ca28..078704e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -829,8 +829,6 @@ public class TezCompiler extends TaskCompiler {
GroupByOperator gbOp = (GroupByOperator) (stack.get(stack.size() - 2));
GroupByDesc gbDesc = gbOp.getConf();
ArrayList<AggregationDesc> aggregationDescs = gbDesc.getAggregators();
- boolean removeSemiJoin = false;
- TableScanOperator ts = sjInfo.getTsOp();
for (AggregationDesc agg : aggregationDescs) {
if (agg.getGenericUDAFName() != "bloom_filter") {
continue;
@@ -844,36 +842,40 @@ public class TezCompiler extends TaskCompiler {
long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
if (expectedEntries == -1 || expectedEntries >
pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
- removeSemiJoin = true;
- if (LOG.isDebugEnabled()) {
- LOG.debug("expectedEntries=" + expectedEntries + ". "
- + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
- + "Removing semijoin "
- + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+ // Remove the semijoin optimization branch along with ALL the mappings
+ // The parent GB2 has all the branches. Collect them and remove them.
+ for (Operator<?> op : gbOp.getChildOperators()) {
+ ReduceSinkOperator rsFinal = (ReduceSinkOperator) op;
+ TableScanOperator ts = pCtx.getRsToSemiJoinBranchInfo().
+ get(rsFinal).getTsOp();
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("expectedEntries=" + expectedEntries + ". "
+ + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
+ + "Removing semijoin "
+ + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+ }
+ GenTezUtils.removeBranch(rsFinal);
+ GenTezUtils.removeSemiJoinOperator(pCtx, rsFinal, ts);
}
- break;
+ return null;
}
}
// At this point, hinted semijoin case has been handled already
// Check if big table is big enough that runtime filtering is
// worth it.
+ TableScanOperator ts = sjInfo.getTsOp();
if (ts.getStatistics() != null) {
long numRows = ts.getStatistics().getNumRows();
if (numRows < pCtx.getConf().getLongVar(ConfVars.TEZ_BIGTABLE_MIN_SIZE_SEMIJOIN_REDUCTION)) {
- removeSemiJoin = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Insufficient rows (" + numRows + ") to justify semijoin optimization. Removing semijoin "
+ OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
}
+ GenTezUtils.removeBranch(rs);
+ GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
}
}
- if (removeSemiJoin) {
- // The stats are not annotated, remove the semijoin operator
- GenTezUtils.removeBranch(rs);
- GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
- }
-
return null;
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
index a47ce6e..1d1f86b 100644
--- a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
+++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out
@@ -692,13 +692,12 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Map 1 <- Reducer 5 (BROADCAST_EDGE), Reducer 8 (BROADCAST_EDGE)
- Map 7 <- Reducer 6 (BROADCAST_EDGE)
- Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+ Map 1 <- Reducer 5 (BROADCAST_EDGE), Reducer 7 (BROADCAST_EDGE)
+ Map 6 <- Reducer 5 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE)
- Reducer 6 <- Map 4 (CUSTOM_SIMPLE_EDGE)
- Reducer 8 <- Map 7 (CUSTOM_SIMPLE_EDGE)
+ Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -752,22 +751,9 @@ STAGE PLANS:
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
- Select Operator
- expressions: _col0 (type: string)
- outputColumnNames: _col0
- Statistics: Num rows: 20 Data size: 1740 Basic stats: COMPLETE Column stats: PARTIAL
- Group By Operator
- aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=32)
- mode: hash
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
- value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
Execution mode: llap
LLAP IO: all inputs
- Map 7
+ Map 6
Map Operator Tree:
TableScan
alias: srcpart_date
@@ -848,19 +834,11 @@ STAGE PLANS:
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
- Reducer 6
- Execution mode: llap
- Reduce Operator Tree:
- Group By Operator
- aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=32)
- mode: final
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
- Reducer 8
+ Reducer 7
Execution mode: llap
Reduce Operator Tree:
Group By Operator
http://git-wip-us.apache.org/repos/asf/hive/blob/abf72b60/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
index bac9240..bc24893 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
@@ -365,12 +365,11 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
+ Map 5 <- Reducer 4 (BROADCAST_EDGE)
Map 6 <- Reducer 4 (BROADCAST_EDGE)
- Map 7 <- Reducer 5 (BROADCAST_EDGE)
- Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
- Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -404,22 +403,9 @@ STAGE PLANS:
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
- Select Operator
- expressions: cstring (type: string)
- outputColumnNames: _col0
- Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
- Group By Operator
- aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
- mode: hash
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- sort order:
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
- value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
Execution mode: llap
LLAP IO: all inputs
- Map 6
+ Map 5
Map Operator Tree:
TableScan
alias: v
@@ -439,7 +425,7 @@ STAGE PLANS:
Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
LLAP IO: all inputs
- Map 7
+ Map 6
Map Operator Tree:
TableScan
alias: k
@@ -509,14 +495,6 @@ STAGE PLANS:
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
- Reducer 5
- Execution mode: llap
- Reduce Operator Tree:
- Group By Operator
- aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
- mode: final
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
sort order:
Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE