Posted to commits@hive.apache.org by gu...@apache.org on 2017/04/20 17:18:52 UTC
[1/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)
Repository: hive
Updated Branches:
refs/heads/master fa24d4b9b -> 9d5d737db
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
new file mode 100644
index 0000000..5d7b9e5
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+
+public class SemiJoinBranchInfo {
+ private TableScanOperator ts;
+ private boolean isHint;
+
+ public SemiJoinBranchInfo(TableScanOperator ts) {
+ this.ts = ts;
+ isHint = false;
+ }
+
+ public SemiJoinBranchInfo(TableScanOperator ts, boolean isHint) {
+ this.ts = ts;
+ this.isHint = isHint;
+ }
+
+ public TableScanOperator getTsOp() {
+ return ts;
+ }
+
+ public boolean getIsHint() {
+ return isHint;
+ }
+}
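
For readers following the patch, here is a minimal sketch (not part of the commit) of how a semijoin branch might be recorded as hint-created versus cost-based; the ReduceSinkOperator and TableScanOperator instances are assumed to come from an already-built plan:

    import java.util.Map;

    import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
    import org.apache.hadoop.hive.ql.exec.TableScanOperator;
    import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;

    public class SemiJoinBranchInfoSketch {
      // rs and ts are assumed to be operators taken from an existing plan.
      static void register(Map<ReduceSinkOperator, SemiJoinBranchInfo> branchInfo,
          ReduceSinkOperator rs, TableScanOperator ts, boolean fromHint) {
        // Branches marked with isHint=true are later skipped by the removal passes in TezCompiler.
        branchInfo.put(rs, fromHint ? new SemiJoinBranchInfo(ts, true)
                                    : new SemiJoinBranchInfo(ts));
      }
    }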
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
new file mode 100644
index 0000000..1f24e23
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+public class SemiJoinHint {
+ private String tabAlias;
+ private String colName;
+ private Integer numEntries;
+
+ public SemiJoinHint(String tabAlias, String colName, Integer numEntries) {
+ this.tabAlias = tabAlias;
+ this.colName = colName;
+ this.numEntries = numEntries;
+ }
+
+ public String getTabAlias() {
+ return tabAlias;
+ }
+
+ public String getColName() {
+ return colName;
+ }
+
+ public Integer getNumEntries() {
+ return numEntries != null ? numEntries : -1;
+ }
+}
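
As a rough illustration (again not from the patch itself), the hints end up keyed by table alias, and getNumEntries() falls back to -1 when no bloom-filter size was given in the hint:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.hadoop.hive.ql.parse.SemiJoinHint;

    public class SemiJoinHintSketch {
      public static void main(String[] args) {
        Map<String, SemiJoinHint> hints = new HashMap<>();
        // semi(k, str, 5000): alias "k", source column "str", 5000 expected bloom-filter entries.
        hints.put("k", new SemiJoinHint("k", "str", 5000));
        // semi(i, 3000): the column-less form; passing a null column name is an assumption made here.
        hints.put("i", new SemiJoinHint("i", null, 3000));
        // No size supplied: getNumEntries() reports -1.
        System.out.println(new SemiJoinHint("v", "key1", null).getNumEntries()); // -1
      }
    }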
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
index 7caeb78..96525b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
@@ -531,7 +531,7 @@ public abstract class TaskCompiler {
clone.setLineageInfo(pCtx.getLineageInfo());
clone.setMapJoinOps(pCtx.getMapJoinOps());
clone.setRsToRuntimeValuesInfoMap(pCtx.getRsToRuntimeValuesInfoMap());
- clone.setRsOpToTsOpMap(pCtx.getRsOpToTsOpMap());
+ clone.setRsToSemiJoinBranchInfo(pCtx.getRsToSemiJoinBranchInfo());
return clone;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index eaad988..26eda04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -178,6 +178,9 @@ public class TezCompiler extends TaskCompiler {
TableScanOperator victimTS = null;
ReduceSinkOperator victimRS = null;
+ // If there is a hint and no operator is removed, throw an error
+ boolean hasHint = false;
+ boolean removed = false;
for (Operator<?> o : component) {
// Look for AppMasterEventOperator or ReduceSinkOperator
if (o instanceof AppMasterEventOperator) {
@@ -185,25 +188,34 @@ public class TezCompiler extends TaskCompiler {
|| o.getStatistics().getDataSize() < victimAM.getStatistics()
.getDataSize()) {
victimAM = (AppMasterEventOperator) o;
+ removed = true;
}
} else if (o instanceof ReduceSinkOperator) {
- TableScanOperator ts = context.parseContext.getRsOpToTsOpMap().get(o);
- if (ts == null) {
+
+ SemiJoinBranchInfo sjInfo =
+ context.parseContext.getRsToSemiJoinBranchInfo().get(o);
+ if (sjInfo == null ) continue;
+ if (sjInfo.getIsHint()) {
+ // Skipping removal because of the hint; record that a hint was present.
+ hasHint = true;
continue;
}
+
+ TableScanOperator ts = sjInfo.getTsOp();
// Sanity check
assert component.contains(ts);
if (victimRS == null ||
ts.getStatistics().getDataSize() <
- victimTS.getStatistics().getDataSize()) {
- victimRS = (ReduceSinkOperator) o;
- victimTS = ts;
- }
+ victimTS.getStatistics().getDataSize()) {
+ victimRS = (ReduceSinkOperator) o;
+ victimTS = ts;
+ removed = true;
}
}
+ }
- // Always set the min/max optimization as victim.
+ // Always set the semijoin optimization as victim.
Operator<?> victim = victimRS;
if (victimRS == null && victimAM != null ) {
@@ -227,6 +239,11 @@ public class TezCompiler extends TaskCompiler {
}
}
+ if (hasHint && !removed) {
+ // There is a hint but none of the operators was removed. Throw an error.
+ throw new SemanticException("The user hint is causing an operator cycle. Please fix it and retry");
+ }
+
if (victim == null ||
(!context.pruningOpsRemovedByPriorOpt.isEmpty() &&
context.pruningOpsRemovedByPriorOpt.contains(victim))) {
@@ -287,11 +304,12 @@ public class TezCompiler extends TaskCompiler {
LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
children.add(ts);
} else if (o instanceof ReduceSinkOperator){
- // min/max case
+ // semijoin case
children = new ArrayList<Operator<?>>();
children.addAll(o.getChildOperators());
- TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(o);
- if (ts != null) {
+ SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
+ if (sjInfo != null ) {
+ TableScanOperator ts = sjInfo.getTsOp();
LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
children.add(ts);
}
@@ -460,7 +478,7 @@ public class TezCompiler extends TaskCompiler {
if (pCtx.getRsToRuntimeValuesInfoMap().size() > 0) {
for (ReduceSinkOperator rs : pCtx.getRsToRuntimeValuesInfoMap().keySet()) {
// Process min/max
- GenTezUtils.processDynamicMinMaxPushDownOperator(
+ GenTezUtils.processDynamicSemiJoinPushDownOperator(
procCtx, pCtx.getRsToRuntimeValuesInfoMap().get(rs), rs);
}
}
@@ -617,7 +635,7 @@ public class TezCompiler extends TaskCompiler {
private static void removeSemijoinOptimizationFromSMBJoins(
OptimizeTezProcContext procCtx) throws SemanticException {
if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
- procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+ procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
return;
}
@@ -636,9 +654,9 @@ public class TezCompiler extends TaskCompiler {
GraphWalker ogw = new PreOrderOnceWalker(disp);
ogw.startWalking(topNodes, null);
+ List<TableScanOperator> tsOps = new ArrayList<>();
// Iterate over the map and remove semijoin optimizations if needed.
for (CommonMergeJoinOperator joinOp : ctx.JoinOpToTsOpMap.keySet()) {
- List<TableScanOperator> tsOps = new ArrayList<TableScanOperator>();
// Get one top level TS Op directly from the stack
tsOps.add(ctx.JoinOpToTsOpMap.get(joinOp));
@@ -651,7 +669,7 @@ public class TezCompiler extends TaskCompiler {
}
assert parent instanceof SelectOperator;
- while(parent != null) {
+ while (parent != null) {
if (parent instanceof TableScanOperator) {
tsOps.add((TableScanOperator) parent);
break;
@@ -659,20 +677,24 @@ public class TezCompiler extends TaskCompiler {
parent = parent.getParentOperators().get(0);
}
}
-
- // Now the relevant TableScanOperators are known, find if there exists
- // a semijoin filter on any of them, if so, remove it.
- ParseContext pctx = procCtx.parseContext;
- for (TableScanOperator ts : tsOps) {
- for (ReduceSinkOperator rs : pctx.getRsOpToTsOpMap().keySet()) {
- if (ts == pctx.getRsOpToTsOpMap().get(rs)) {
- // match!
- if (LOG.isDebugEnabled()) {
- LOG.debug("Semijoin optimization found going to SMB join. Removing semijoin "
- + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
- }
- GenTezUtils.removeBranch(rs);
- GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+ }
+ // Now the relevant TableScanOperators are known, find if there exists
+ // a semijoin filter on any of them, if so, remove it.
+
+ ParseContext pctx = procCtx.parseContext;
+ for (TableScanOperator ts : tsOps) {
+ for (ReduceSinkOperator rs : pctx.getRsToSemiJoinBranchInfo().keySet()) {
+ SemiJoinBranchInfo sjInfo = pctx.getRsToSemiJoinBranchInfo().get(rs);
+ if (ts == sjInfo.getTsOp()) {
+ // match!
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Semijoin optimization found going to SMB join. Removing semijoin "
+ + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+ }
+ GenTezUtils.removeBranch(rs);
+ GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+ if (sjInfo.getIsHint()) {
+ LOG.debug("Removing hinted semijoin as it is with SMB join " + rs + " : " + ts);
}
}
}
@@ -699,7 +721,7 @@ public class TezCompiler extends TaskCompiler {
private static void removeSemiJoinCyclesDueToMapsideJoins(
OptimizeTezProcContext procCtx) throws SemanticException {
if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
- procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+ procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
return;
}
@@ -752,10 +774,10 @@ public class TezCompiler extends TaskCompiler {
}
ReduceSinkOperator rs = ((ReduceSinkOperator) child);
- TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
- if (ts == null) {
- continue;
- }
+ SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+ if (sjInfo == null) continue;
+
+ TableScanOperator ts = sjInfo.getTsOp();
// This is a semijoin branch. Find if this is creating a potential
// cycle with childJoin.
for (Operator<?> parent : childJoin.getParentOperators()) {
@@ -776,6 +798,9 @@ public class TezCompiler extends TaskCompiler {
}
GenTezUtils.removeBranch(rs);
GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
+ if (sjInfo.getIsHint()) {
+ LOG.debug("Removing hinted semijoin as it is creating cycles with mapside joins " + rs + " : " + ts);
+ }
}
}
}
@@ -790,8 +815,8 @@ public class TezCompiler extends TaskCompiler {
assert nd instanceof ReduceSinkOperator;
ReduceSinkOperator rs = (ReduceSinkOperator) nd;
ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext;
- TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
- if (ts == null) {
+ SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+ if (sjInfo == null) {
// nothing to do here.
return null;
}
@@ -802,6 +827,7 @@ public class TezCompiler extends TaskCompiler {
GroupByDesc gbDesc = gbOp.getConf();
ArrayList<AggregationDesc> aggregationDescs = gbDesc.getAggregators();
boolean removeSemiJoin = false;
+ TableScanOperator ts = sjInfo.getTsOp();
for (AggregationDesc agg : aggregationDescs) {
if (agg.getGenericUDAFName() != "bloom_filter") {
continue;
@@ -809,20 +835,24 @@ public class TezCompiler extends TaskCompiler {
GenericUDAFBloomFilterEvaluator udafBloomFilterEvaluator =
(GenericUDAFBloomFilterEvaluator) agg.getGenericUDAFEvaluator();
+ if (udafBloomFilterEvaluator.hasHintEntries())
+ return null; // Created using hint, skip it
+
long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
if (expectedEntries == -1 || expectedEntries >
pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
removeSemiJoin = true;
if (LOG.isDebugEnabled()) {
LOG.debug("expectedEntries=" + expectedEntries + ". "
- + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
- + "Removing semijoin "
- + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+ + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
+ + "Removing semijoin "
+ + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
}
break;
}
}
+ // At this point, hinted semijoin case has been handled already
// Check if big table is big enough that runtime filtering is
// worth it.
if (ts.getStatistics() != null) {
@@ -831,16 +861,16 @@ public class TezCompiler extends TaskCompiler {
removeSemiJoin = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Insufficient rows (" + numRows + ") to justify semijoin optimization. Removing semijoin "
- + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+ + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
}
}
}
-
if (removeSemiJoin) {
// The stats are not annotated, remove the semijoin operator
GenTezUtils.removeBranch(rs);
GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
}
+
return null;
}
}
@@ -905,15 +935,23 @@ public class TezCompiler extends TaskCompiler {
}
ReduceSinkOperator rs = (ReduceSinkOperator) child;
- TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(rs);
- if (ts == null || ts != bigTableTS) {
- // skip, no semijoin or not the one we are looking for.
+ SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
+ if (sjInfo == null) continue;
+
+ TableScanOperator ts = sjInfo.getTsOp();
+ if (ts != bigTableTS) {
+ // skip, not the one we are looking for.
continue;
}
+ parallelEdges = true;
+
+ if (sjInfo.getIsHint()) {
+ // Created by hint, skip it
+ continue;
+ }
// Add the semijoin branch to the map
semijoins.put(rs, ts);
- parallelEdges = true;
}
}
}
@@ -1141,10 +1179,15 @@ public class TezCompiler extends TaskCompiler {
}
List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<ReduceSinkOperator>();
- Map<ReduceSinkOperator, TableScanOperator> map = procCtx.parseContext.getRsOpToTsOpMap();
+ Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
double semijoinReductionThreshold = procCtx.conf.getFloatVar(
HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
for (ReduceSinkOperator rs : map.keySet()) {
+ SemiJoinBranchInfo sjInfo = map.get(rs);
+ if (sjInfo.getIsHint()) {
+ // Semijoin created using hint, skip it
+ continue;
+ }
// rs is semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
// Get to the SelectOperator ancestor
SelectOperator sel = null;
@@ -1159,7 +1202,7 @@ public class TezCompiler extends TaskCompiler {
}
// Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
- TableScanOperator ts = map.get(rs);
+ TableScanOperator ts = sjInfo.getTsOp();
RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
ExprNodeDesc tsExpr = rti.getTsColExpr();
// In the SEL operator of the semijoin branch, there should be only one column in the operator
@@ -1179,7 +1222,7 @@ public class TezCompiler extends TaskCompiler {
}
for (ReduceSinkOperator rs : semijoinRsToRemove) {
- TableScanOperator ts = map.get(rs);
+ TableScanOperator ts = map.get(rs).getTsOp();
if (LOG.isDebugEnabled()) {
LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs)
+ "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
index 18e4fbd..3143554 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
@@ -18,7 +18,10 @@
package org.apache.hadoop.hive.ql.plan;
+import java.util.Map;
+
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
/**
@@ -29,14 +32,17 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
Operator<? extends OperatorDesc> source;
int keyIndex;
+ Map<String, SemiJoinHint> hints;
public ExprNodeDynamicListDesc() {
}
- public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends OperatorDesc> source, int keyIndex) {
+ public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends OperatorDesc> source,
+ int keyIndex, Map<String, SemiJoinHint> hints) {
super(typeInfo);
this.source = source;
this.keyIndex = keyIndex;
+ this.hints = hints;
}
public void setSource(Operator<? extends OperatorDesc> source) {
@@ -57,8 +63,7 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
@Override
public ExprNodeDesc clone() {
- ExprNodeDynamicListDesc clone = new ExprNodeDynamicListDesc(typeInfo, source, keyIndex);
- return clone;
+ return new ExprNodeDynamicListDesc(typeInfo, source, keyIndex, hints);
}
@Override
@@ -78,4 +83,8 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
public String toString() {
return source.toString();
}
+
+ public Map<String, SemiJoinHint> getHints() {
+ return hints;
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
index bcf3691..032c7bb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
@@ -29,6 +29,7 @@ import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
@@ -106,6 +107,10 @@ public class JoinDesc extends AbstractOperatorDesc {
private transient Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo;
private transient boolean leftInputJoin;
private transient List<String> streamAliases;
+ // Note: there are two things in Hive called semi-joins - the left semi join construct,
+ // and also a bloom-filter based optimization that came later. This is for the latter.
+ // Everything else in this desc that says "semi-join" is for the former.
+ private transient Map<String, SemiJoinHint> semiJoinHints;
public JoinDesc() {
}
@@ -197,6 +202,7 @@ public class JoinDesc extends AbstractOperatorDesc {
this.filterMap = clone.filterMap;
this.residualFilterExprs = clone.residualFilterExprs;
this.statistics = clone.statistics;
+ this.semiJoinHints = clone.semiJoinHints;
}
public Map<Byte, List<ExprNodeDesc>> getExprs() {
@@ -682,4 +688,16 @@ public class JoinDesc extends AbstractOperatorDesc {
streamAliases = joinDesc.streamAliases == null ? null : new ArrayList<String>(joinDesc.streamAliases);
}
+ private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(JoinDesc.class);
+ public void setSemiJoinHints(Map<String, SemiJoinHint> semiJoinHints) {
+ if (semiJoinHints != null || this.semiJoinHints != null) {
+ LOG.debug("Setting semi-join hints to " + semiJoinHints);
+ }
+ this.semiJoinHints = semiJoinHints;
+ }
+
+ public Map<String, SemiJoinHint> getSemiJoinHints() {
+ return semiJoinHints;
+ }
+
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
index 71c7310..f45daa8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
@@ -26,6 +26,7 @@ import java.util.Map;
import java.util.Set;
import java.util.Stack;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
@@ -134,13 +135,12 @@ public class SyntheticJoinPredicate extends Transform {
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
- ParseContext pCtx = ((SyntheticContext) procCtx).getParseContext();
-
@SuppressWarnings("unchecked")
CommonJoinOperator<JoinDesc> join = (CommonJoinOperator<JoinDesc>) nd;
ReduceSinkOperator source = (ReduceSinkOperator) stack.get(stack.size() - 2);
int srcPos = join.getParentOperators().indexOf(source);
+ Map<String, SemiJoinHint> hints = join.getConf().getSemiJoinHints();
List<Operator<? extends OperatorDesc>> parents = join.getParentOperators();
@@ -181,7 +181,7 @@ public class SyntheticJoinPredicate extends Transform {
inArgs.add(sourceKeys.get(i));
ExprNodeDynamicListDesc dynamicExpr =
- new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), target, i);
+ new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), target, i, hints);
inArgs.add(dynamicExpr);
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
index 2b84beb..2413ae6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
@@ -72,6 +72,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator {
// Source operator to get the number of entries
private SelectOperator sourceOperator;
+ private long hintEntries = -1;
private long maxEntries = 0;
private long minEntries = 0;
private float factor = 1;
@@ -254,6 +255,10 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
}
public long getExpectedEntries() {
+ // If a hint is provided, use that size.
+ if (hintEntries > 0 )
+ return hintEntries;
+
long expectedEntries = -1;
if (sourceOperator != null && sourceOperator.getStatistics() != null) {
Statistics stats = sourceOperator.getStatistics();
@@ -294,6 +299,14 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
this.sourceOperator = sourceOperator;
}
+ public void setHintEntries(long hintEntries) {
+ this.hintEntries = hintEntries;
+ }
+
+ public boolean hasHintEntries() {
+ return hintEntries != -1;
+ }
+
public void setMaxEntries(long maxEntries) {
this.maxEntries = maxEntries;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/queries/clientpositive/semijoin_hint.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/semijoin_hint.q b/ql/src/test/queries/clientpositive/semijoin_hint.q
new file mode 100644
index 0000000..5de0c8c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/semijoin_hint.q
@@ -0,0 +1,54 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.cbo.enable=true;
+set hive.compute.query.using.stats=false;
+set hive.mapred.mode=nonstrict;
+set hive.optimize.ppd=true;
+set hive.ppd.remove.duplicatefilters=true;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.optimize.metadataonly=false;
+set hive.optimize.index.filter=true;
+set hive.stats.autogather=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
+
+-- Create Tables
+create table alltypesorc_int ( cint int, cstring string ) stored as ORC;
+create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC;
+CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC;
+
+-- Add Partitions
+alter table srcpart_date add partition (ds = "2008-04-08");
+alter table srcpart_date add partition (ds = "2008-04-09");
+
+alter table srcpart_small add partition (ds = "2008-04-08");
+alter table srcpart_small add partition (ds = "2008-04-09");
+
+-- Load
+insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc;
+insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08";
+insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
+insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
+analyze table alltypesorc_int compute statistics for columns;
+analyze table srcpart_date compute statistics for columns;
+analyze table srcpart_small compute statistics for columns;
+
+set hive.cbo.returnpath.hiveop=true;
+
+create table srccc as select * from src;
+
+EXPLAIN select /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring);
+EXPLAIN select /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring);
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.returnpath.hiveop=false;
+
+explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.enable=false;
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
new file mode 100644
index 0000000..bac9240
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
@@ -0,0 +1,899 @@
+PREHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@alltypesorc_int
+PREHOOK: query: create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_date
+PREHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_small
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+PREHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@alltypesorc_int
+POSTHOOK: Lineage: alltypesorc_int.cint SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: alltypesorc_int.cstring SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).str SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).str SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).key1 SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).value1 SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: analyze table alltypesorc_int compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc_int
+PREHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table alltypesorc_int compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc_int
+POSTHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_date compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_date
+PREHOOK: Input: default@srcpart_date@ds=2008-04-08
+PREHOOK: Input: default@srcpart_date@ds=2008-04-09
+PREHOOK: Output: default@srcpart_date
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_date compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_date
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_small compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_small
+PREHOOK: Input: default@srcpart_small@ds=2008-04-08
+PREHOOK: Input: default@srcpart_small@ds=2008-04-09
+PREHOOK: Output: default@srcpart_small
+PREHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_small compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_small
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: create table srccc as select * from src
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srccc
+POSTHOOK: query: create table srccc as select * from src
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srccc
+POSTHOOK: Lineage: srccc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srccc.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN select /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Reducer 7 (BROADCAST_EDGE)
+ Map 8 <- Reducer 5 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
+ Reducer 3 <- Map 8 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)
+ Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+ Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: i
+ filterExpr: (cstring is not null and (cstring BETWEEN DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (cstring is not null and (cstring BETWEEN DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: cstring (type: string)
+ outputColumnNames: cstring
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: cstring (type: string)
+ sort order: +
+ Map-reduce partition columns: cstring (type: string)
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 6
+ Map Operator Tree:
+ TableScan
+ alias: k
+ filterExpr: (str is not null and value is not null) (type: boolean)
+ Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (str is not null and value is not null) (type: boolean)
+ Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string), value (type: string)
+ outputColumnNames: str, value
+ Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: value (type: string)
+ sort order: +
+ Map-reduce partition columns: value (type: string)
+ Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: str (type: string)
+ Select Operator
+ expressions: value (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2000 Data size: 182000 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=5000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 8
+ Map Operator Tree:
+ TableScan
+ alias: v
+ filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: key1 (type: string)
+ outputColumnNames: key1
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: key1 (type: string)
+ sort order: +
+ Map-reduce partition columns: key1 (type: string)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Execution mode: llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 cstring (type: string)
+ 1 value (type: string)
+ outputColumnNames: str
+ Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: str (type: string)
+ sort order: +
+ Map-reduce partition columns: str (type: string)
+ Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=410)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 str (type: string)
+ 1 key1 (type: string)
+ Statistics: Num rows: 16004 Data size: 128032 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ Statistics: Num rows: 16004 Data size: 64016 Basic stats: COMPLETE Column stats: PARTIAL
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: bigint)
+ Reducer 4
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: $f0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 5
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=410)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Reducer 7
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=5000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: EXPLAIN select /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 6 <- Reducer 4 (BROADCAST_EDGE)
+ Map 7 <- Reducer 5 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+ Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: i
+ filterExpr: cstring is not null (type: boolean)
+ Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: cstring is not null (type: boolean)
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: cstring (type: string)
+ outputColumnNames: cstring
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: cstring (type: string)
+ sort order: +
+ Map-reduce partition columns: cstring (type: string)
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: cstring (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Select Operator
+ expressions: cstring (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 6
+ Map Operator Tree:
+ TableScan
+ alias: v
+ filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: key1 (type: string)
+ outputColumnNames: key1
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: key1 (type: string)
+ sort order: +
+ Map-reduce partition columns: key1 (type: string)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 7
+ Map Operator Tree:
+ TableScan
+ alias: k
+ filterExpr: (str is not null and (str BETWEEN DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (str is not null and (str BETWEEN DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: str
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: str (type: string)
+ sort order: +
+ Map-reduce partition columns: str (type: string)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 1 to 2
+ keys:
+ 0 cstring (type: string)
+ 1 key1 (type: string)
+ 2 str (type: string)
+ Statistics: Num rows: 16008 Data size: 128064 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ Statistics: Num rows: 16008 Data size: 64032 Basic stats: COMPLETE Column stats: PARTIAL
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: bigint)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: $f0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Reducer 5
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 5 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: k
+ filterExpr: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: str
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: str (type: string)
+ sort order: +
+ Map-reduce partition columns: str (type: string)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: v
+ filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: key1 (type: string)
+ outputColumnNames: key1
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: key1 (type: string)
+ sort order: +
+ Map-reduce partition columns: key1 (type: string)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Execution mode: llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 str (type: string)
+ 1 key1 (type: string)
+ Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ Statistics: Num rows: 9756 Data size: 39024 Basic stats: COMPLETE Column stats: PARTIAL
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: bigint)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: $f0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 5 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: k
+ filterExpr: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: v
+ filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: key1 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Execution mode: llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: bigint)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 5 <- Reducer 4 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: k
+ filterExpr: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: str is not null (type: boolean)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: str (type: string)
+ sort order: +
+ Map-reduce partition columns: str (type: string)
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: str (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+ Execution mode: llap
+ LLAP IO: all inputs
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: v
+ filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Filter Operator
+ predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: key1 (type: string)
+ sort order: +
+ Map-reduce partition columns: key1 (type: string)
+ Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+ Execution mode: llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 str (type: string)
+ 1 key1 (type: string)
+ Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+ Group By Operator
+ aggregations: count()
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: bigint)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
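For reference, the two hint shapes exercised by the plans above are semi(<alias>, <size>) and semi(<alias>, <column>, <size>); the size becomes the expectedEntries of the bloom_filter aggregation in the plan. A minimal illustration against the same test tables (the column argument is optional):

  -- hint on the alias only; the join column is left to the optimizer
  explain select /*+ semi(k, 1000) */ count(*)
  from srcpart_date k join srcpart_small v on (k.str = v.key1);

  -- hint pinned to column str, expecting roughly 1000 distinct entries
  explain select /*+ semi(k, str, 1000) */ count(*)
  from srcpart_date k join srcpart_small v on (k.str = v.key1);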
[5/5] hive git commit: HIVE-16423: Add hint to enforce semi join
optimization (Deepak Jaiswal, reviewed by Jason Dere)
Posted by gu...@apache.org.
HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/9d5d737d
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/9d5d737d
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/9d5d737d
Branch: refs/heads/master
Commit: 9d5d737db4f715a880f0d544d548a5ce370f602b
Parents: fa24d4b
Author: Gunther Hagleitner <gu...@apache.org>
Authored: Thu Apr 20 10:07:52 2017 -0700
Committer: Gunther Hagleitner <gu...@apache.org>
Committed: Thu Apr 20 10:07:52 2017 -0700
----------------------------------------------------------------------
.../org/apache/hadoop/hive/conf/HiveConf.java | 2 +
.../test/resources/testconfiguration.properties | 1 +
.../hive/ql/optimizer/ConvertJoinMapJoin.java | 4 +-
.../DynamicPartitionPruningOptimization.java | 102 +-
.../calcite/translator/HiveOpConverter.java | 24 +-
.../hadoop/hive/ql/parse/CalcitePlanner.java | 35 +
.../hive/ql/parse/CalcitePlanner.java.orig | 4188 +++++
.../hadoop/hive/ql/parse/GenTezUtils.java | 25 +-
.../apache/hadoop/hive/ql/parse/HintParser.g | 3 +
.../hadoop/hive/ql/parse/ParseContext.java | 25 +-
.../apache/hadoop/hive/ql/parse/QBJoinTree.java | 16 +
.../hadoop/hive/ql/parse/SemanticAnalyzer.java | 63 +
.../hive/ql/parse/SemanticAnalyzer.java.orig | 13508 +++++++++++++++++
.../hive/ql/parse/SemiJoinBranchInfo.java | 45 +
.../hadoop/hive/ql/parse/SemiJoinHint.java | 43 +
.../hadoop/hive/ql/parse/TaskCompiler.java | 2 +-
.../hadoop/hive/ql/parse/TezCompiler.java | 137 +-
.../hive/ql/plan/ExprNodeDynamicListDesc.java | 15 +-
.../apache/hadoop/hive/ql/plan/JoinDesc.java | 18 +
.../hive/ql/ppd/SyntheticJoinPredicate.java | 6 +-
.../ql/udf/generic/GenericUDAFBloomFilter.java | 13 +
.../test/queries/clientpositive/semijoin_hint.q | 54 +
.../clientpositive/llap/semijoin_hint.q.out | 899 ++
23 files changed, 19107 insertions(+), 121 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 420d35e..b10b08e 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2892,6 +2892,8 @@ public class HiveConf extends Configuration {
"Big table for runtime filteting should be of atleast this size"),
TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD("hive.tez.dynamic.semijoin.reduction.threshold", (float) 0.50,
"Only perform semijoin optimization if the estimated benefit at or above this fraction of the target table"),
+ TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY("hive.tez.dynamic.semijoin.reduction.hint.only", false,
+ "When true, only enforce semijoin when a hint is provided"),
TEZ_SMB_NUMBER_WAVES(
"hive.tez.smb.number.waves",
(float) 0.5,
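The new flag defaults to false, so existing plans are unchanged; setting it to true restricts semijoin reduction to branches that carry a hint. A hedged usage sketch, reusing the hint syntax from the tests in this patch:

  set hive.tez.dynamic.semijoin.reduction.hint.only=true;
  explain select /*+ semi(k, str, 1000) */ count(*)
  from srcpart_date k join srcpart_small v on (k.str = v.key1);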
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index ed5ce9d..116d0eb 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -572,6 +572,7 @@ minillaplocal.query.files=acid_globallimit.q,\
schema_evol_text_vecrow_table.q,\
selectDistinctStar.q,\
semijoin.q,\
+ semijoin_hint.q,\
smb_cache.q,\
special_character_in_tabnames_1.q,\
sqlmerge.q,\
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
index db6b05b..637bc54 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
@@ -794,7 +794,7 @@ public class ConvertJoinMapJoin implements NodeProcessor {
// The semijoin branch can potentially create a task level cycle
// with the hashjoin except when it is dynamically partitioned hash
// join which takes place in a separate task.
- if (context.parseContext.getRsOpToTsOpMap().size() > 0
+ if (context.parseContext.getRsToSemiJoinBranchInfo().size() > 0
&& removeReduceSink) {
removeCycleCreatingSemiJoinOps(mapJoinOp, parentSelectOpOfBigTableOp,
context.parseContext);
@@ -826,7 +826,7 @@ public class ConvertJoinMapJoin implements NodeProcessor {
}
ReduceSinkOperator rs = (ReduceSinkOperator) op;
- TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(rs);
+ TableScanOperator ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
if (ts == null) {
// skip, no semijoin branch
continue;
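With the rename from getRsOpToTsOpMap to getRsToSemiJoinBranchInfo, callers now reach the target TableScan through SemiJoinBranchInfo. A small illustrative helper (the class and method names here are made up, and the map signature keyed by ReduceSinkOperator is inferred from the hunk above):

  import java.util.Map;

  import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
  import org.apache.hadoop.hive.ql.exec.TableScanOperator;
  import org.apache.hadoop.hive.ql.parse.ParseContext;
  import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;

  public final class SemiJoinBranchLookup {
    // Illustrative only: returns the TableScan targeted by a semijoin-reduction
    // ReduceSink, or null when the RS is not the tail of a semijoin branch.
    static TableScanOperator targetTs(ParseContext pctx, ReduceSinkOperator rs) {
      Map<ReduceSinkOperator, SemiJoinBranchInfo> branches = pctx.getRsToSemiJoinBranchInfo();
      SemiJoinBranchInfo info = branches.get(rs);
      return info == null ? null : info.getTsOp();
    }
  }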
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index 838cc69..eb3eba5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -43,12 +43,7 @@ import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
-import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
-import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo;
-import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
-import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.*;
import org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext;
import org.apache.hadoop.hive.ql.plan.*;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator;
@@ -215,16 +210,25 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
} else {
LOG.debug("Column " + column + " is not a partition column");
if (semiJoin && ts.getConf().getFilterExpr() != null) {
- LOG.debug("Initiate semijoin reduction for " + column);
- // Get the table name from which the min-max values will come.
+ LOG.debug("Initiate semijoin reduction for " + column + " ("
+ + ts.getConf().getFilterExpr().getExprString());
+ // Get the table name from which the min-max values and bloom filter will come.
Operator<?> op = ctx.generator;
+
while (!(op == null || op instanceof TableScanOperator)) {
op = op.getParentOperators().get(0);
}
String tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
+
+ Map<String, SemiJoinHint> hints = ctx.desc.getHints();
+ SemiJoinHint sjHint = (hints != null) ? hints.get(tableAlias) : null;
keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + column;
- semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias);
+ semiJoinAttempted = generateSemiJoinOperatorPlan(
+ ctx, parseContext, ts, keyBaseAlias, sjHint);
+ if (!semiJoinAttempted && sjHint != null) {
+ throw new SemanticException("The user hint to enforce semijoin failed required conditions");
+ }
}
}
@@ -387,7 +391,13 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
// Generates plan for min/max when dynamic partition pruning is ruled out.
private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext,
- TableScanOperator ts, String keyBaseAlias) throws SemanticException {
+ TableScanOperator ts, String keyBaseAlias, SemiJoinHint sjHint) throws SemanticException {
+
+ // If semijoin hint is enforced, make sure hint is provided
+ if (parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY)
+ && sjHint == null) {
+ return false;
+ }
// we will put a fork in the plan at the source of the reduce sink
Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
@@ -441,6 +451,14 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
}
}
+ // If hint is provided and only hinted semijoin optimizations should be
+ // created, then skip other columns on the table
+ if (parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY)
+ && sjHint.getColName() != null &&
+ !internalColName.equals(sjHint.getColName())) {
+ return false;
+ }
+
List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
keyExprs.add(key);
@@ -484,8 +502,6 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
HiveConf.getFloatVar(parseContext.getConf(),
HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
- ArrayList<ExprNodeDesc> groupByExprs = new ArrayList<ExprNodeDesc>();
-
// Add min/max and bloom filter aggregations
List<ObjectInspector> aggFnOIs = new ArrayList<ObjectInspector>();
aggFnOIs.add(key.getWritableObjectInspector());
@@ -505,8 +521,14 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
AggregationDesc bloomFilter = new AggregationDesc("bloom_filter",
FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false),
params, false, Mode.PARTIAL1);
- GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
+ GenericUDAFBloomFilterEvaluator bloomFilterEval =
+ (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
bloomFilterEval.setSourceOperator(selectOp);
+
+ if (sjHint != null && sjHint.getNumEntries() > 0) {
+ LOG.debug("Setting size for " + keyBaseAlias + " to " + sjHint.getNumEntries() + " based on the hint");
+ bloomFilterEval.setHintEntries(sjHint.getNumEntries());
+ }
bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
@@ -607,6 +629,9 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
bloomFilterFinalParams, false, Mode.FINAL);
GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
bloomFilterEval.setSourceOperator(selectOp);
+ if (sjHint != null && sjHint.getNumEntries() > 0) {
+ bloomFilterEval.setHintEntries(sjHint.getNumEntries());
+ }
bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
@@ -635,23 +660,56 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
rsOp.getConf().setOutputOperators(outputOperators);
}
+ createFinalRsForSemiJoinOp(parseContext, ts, groupByOpFinal, key,
+ keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
+
+ return true;
+ }
+
+ private void createFinalRsForSemiJoinOp(
+ ParseContext parseContext, TableScanOperator ts, GroupByOperator gb,
+ ExprNodeDesc key, String keyBaseAlias, ExprNodeDesc colExpr,
+ boolean isHint) throws SemanticException {
+ ArrayList<String> gbOutputNames = new ArrayList<>();
+ // One each for min, max and bloom filter
+ gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
+ gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
+ gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
+
+ int colPos = 0;
+ ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
+ for (int i = 0; i < gbOutputNames.size() - 1; i++) {
+ ExprNodeColumnDesc expr = new ExprNodeColumnDesc(key.getTypeInfo(),
+ gbOutputNames.get(colPos++), "", false);
+ rsValueCols.add(expr);
+ }
+
+ // Bloom Filter uses binary
+ ExprNodeColumnDesc colBFExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo,
+ gbOutputNames.get(colPos++), "", false);
+ rsValueCols.add(colBFExpr);
+
// Create the final Reduce Sink Operator
ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc(
new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false,
-1, 0, 1, Operation.NOT_ACID);
ReduceSinkOperator rsOpFinal = (ReduceSinkOperator)OperatorFactory.getAndMakeChild(
- rsDescFinal, new RowSchema(groupByOpFinal.getSchema()), groupByOpFinal);
+ rsDescFinal, new RowSchema(gb.getSchema()), gb);
+ Map<String, ExprNodeDesc> columnExprMap = new HashMap<>();
rsOpFinal.setColumnExprMap(columnExprMap);
- LOG.debug("DynamicMinMaxPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
- parseContext.getRsOpToTsOpMap().put(rsOpFinal, ts);
+ LOG.debug("DynamicSemiJoinPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
+ SemiJoinBranchInfo sjInfo = new SemiJoinBranchInfo(ts, isHint);
+ parseContext.getRsToSemiJoinBranchInfo().put(rsOpFinal, sjInfo);
// for explain purpose
- if (parseContext.getContext().getExplainConfig() != null
- && parseContext.getContext().getExplainConfig().isFormatted()) {
- List<String> outputOperators = new ArrayList<>();
+ if (parseContext.getContext().getExplainConfig() != null &&
+ parseContext.getContext().getExplainConfig().isFormatted()) {
+ List<String> outputOperators = rsOpFinal.getConf().getOutputOperators();
+ if (outputOperators == null) {
+ outputOperators = new ArrayList<>();
+ }
outputOperators.add(ts.getOperatorId());
- rsOpFinal.getConf().setOutputOperators(outputOperators);
}
// Save the info that is required at query time to resolve dynamic/runtime values.
@@ -666,10 +724,8 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
runtimeValuesInfo.setTableDesc(rsFinalTableDesc);
runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs);
runtimeValuesInfo.setColExprs(rsValueCols);
- runtimeValuesInfo.setTsColExpr(ctx.parent.getChildren().get(0));
+ runtimeValuesInfo.setTsColExpr(colExpr);
parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
-
- return true;
}
private Map<Node, Object> collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx)
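Taken together, the hunks above gate creation of a semijoin-reduction branch on the hint and on hive.tez.dynamic.semijoin.reduction.hint.only. A condensed, hedged sketch of that decision (the class and method names are made up; ConfVars and the SemiJoinHint accessors are the ones used in this patch):

  import org.apache.hadoop.hive.conf.HiveConf;
  import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
  import org.apache.hadoop.hive.ql.parse.SemiJoinHint;

  public final class SemiJoinHintGate {
    // Mirrors the checks in generateSemiJoinOperatorPlan: bail out when hints
    // are mandatory but absent, or when the hint pins a different column.
    static boolean allowBranch(HiveConf conf, SemiJoinHint sjHint, String internalColName) {
      boolean hintOnly = conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY);
      if (hintOnly && sjHint == null) {
        return false;
      }
      if (hintOnly && sjHint.getColName() != null
          && !internalColName.equals(sjHint.getColName())) {
        return false;
      }
      return true;
    }
  }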
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
index 73a9b0f..d375d1b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
@@ -19,6 +19,8 @@
package org.apache.hadoop.hive.ql.optimizer.calcite.translator;
+import org.apache.hadoop.hive.ql.parse.*;
+
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -72,19 +74,8 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortExchange
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
-import org.apache.hadoop.hive.ql.parse.JoinCond;
-import org.apache.hadoop.hive.ql.parse.JoinType;
-import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec;
import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
-import org.apache.hadoop.hive.ql.parse.PTFTranslator;
-import org.apache.hadoop.hive.ql.parse.ParseUtils;
-import org.apache.hadoop.hive.ql.parse.RowResolver;
-import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
-import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.parse.UnparseTranslator;
-import org.apache.hadoop.hive.ql.parse.WindowingComponentizer;
-import org.apache.hadoop.hive.ql.parse.WindowingSpec;
import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -348,6 +339,9 @@ public class HiveOpConverter {
// through Hive
String[] baseSrc = new String[joinRel.getInputs().size()];
String tabAlias = getHiveDerivedTableAlias();
+ Map<String, SemiJoinHint> semiJoinHints = semanticAnalyzer.parseSemiJoinHint(
+ semanticAnalyzer.getQB().getParseInfo().getHints());
+
// 1. Convert inputs
OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()];
List<Operator<?>> children = new ArrayList<Operator<?>>(joinRel.getInputs().size());
@@ -413,7 +407,7 @@ public class HiveOpConverter {
// 6. Generate Join operator
JoinOperator joinOp = genJoin(joinRel, joinExpressions, filterExpressions, children,
- baseSrc, tabAlias);
+ baseSrc, tabAlias, semiJoinHints);
// 7. Return result
return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
@@ -726,7 +720,7 @@ public class HiveOpConverter {
List<String> keepColNames) throws SemanticException {
// 1. Generate RS operator
// 1.1 Prune the tableNames, only count the tableNames that are not empty strings
- // as empty string in table aliases is only allowed for virtual columns.
+ // as empty string in table aliases is only allowed for virtual columns.
String tableAlias = null;
Set<String> tableNames = input.getSchema().getTableNames();
for (String tableName : tableNames) {
@@ -885,7 +879,8 @@ public class HiveOpConverter {
private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions,
List<List<ExprNodeDesc>> filterExpressions, List<Operator<?>> children,
- String[] baseSrc, String tabAlias) throws SemanticException {
+ String[] baseSrc, String tabAlias, Map<String, SemiJoinHint> semiJoinHints)
+ throws SemanticException {
// 1. Extract join type
JoinCondDesc[] joinCondns;
@@ -1011,6 +1006,7 @@ public class HiveOpConverter {
// 4. We create the join operator with its descriptor
JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns,
filters, joinExpressions);
+ desc.setSemiJoinHints(semiJoinHints);
desc.setReversedExprs(reversedExprs);
desc.setFilterMap(filterMap);
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index c97b3e7..d10b6bf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hive.ql.parse;
+import org.antlr.runtime.tree.Tree;
+
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
@@ -332,6 +334,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
skipCalcitePlan = true;
} else {
PreCboCtx cboCtx = (PreCboCtx) plannerCtx;
+ ASTNode oldHints = getQB().getParseInfo().getHints();
// Note: for now, we don't actually pass the queryForCbo to CBO, because
// it accepts qb, not AST, and can also access all the private stuff in
@@ -399,6 +402,15 @@ public class CalcitePlanner extends SemanticAnalyzer {
newAST = reAnalyzeCTASAfterCbo(newAST);
}
}
+ if (oldHints != null) {
+ if (getQB().getParseInfo().getHints() != null) {
+ LOG.warn("Hints are not null in the optimized tree; before CBO " + oldHints.dump()
+ + "; after CBO " + getQB().getParseInfo().getHints().dump());
+ } else {
+ LOG.debug("Propagating hints to QB: " + oldHints);
+ getQB().getParseInfo().setHints(oldHints);
+ }
+ }
Phase1Ctx ctx_1 = initPhase1Ctx();
if (!doPhase1(newAST, getQB(), ctx_1, null)) {
throw new RuntimeException("Couldn't do phase1 on CBO optimized query plan");
@@ -3462,6 +3474,24 @@ public class CalcitePlanner extends SemanticAnalyzer {
return selRel;
}
+ private void setQueryHints(QB qb) throws SemanticException {
+ QBParseInfo qbp = getQBParseInfo(qb);
+ String selClauseName = qbp.getClauseNames().iterator().next();
+ Tree selExpr0 = qbp.getSelForClause(selClauseName).getChild(0);
+
+ if (selExpr0.getType() != HiveParser.QUERY_HINT) return;
+ String hint = ctx.getTokenRewriteStream().toString(
+ selExpr0.getTokenStartIndex(), selExpr0.getTokenStopIndex());
+ LOG.debug("Handling query hints: " + hint);
+ ParseDriver pd = new ParseDriver();
+ try {
+ ASTNode hintNode = pd.parseHint(hint);
+ qbp.setHints((ASTNode) hintNode);
+ } catch (ParseException e) {
+ throw new SemanticException("failed to parse query hint: "+e.getMessage(), e);
+ }
+ }
+
/**
* NOTE: there can only be one select caluse since we don't handle multi
* destination insert.
@@ -3960,7 +3990,12 @@ public class CalcitePlanner extends SemanticAnalyzer {
throw new CalciteSemanticException("Unsupported", UnsupportedFeature.Others);
}
+
// 1.3 process join
+ // 1.3.1 process hints
+ setQueryHints(qb);
+
+ // 1.3.2 process the actual join
if (qb.getParseInfo().getJoinExpr() != null) {
srcRel = genJoinLogicalPlan(qb.getParseInfo().getJoinExpr(), aliasToRel);
} else {
[4/5] hive git commit: HIVE-16423: Add hint to enforce semi join
optimization (Deepak Jaiswal, reviewed by Jason Dere)
Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
new file mode 100644
index 0000000..c97b3e7
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
@@ -0,0 +1,4188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.parse;
+
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.UndeclaredThrowableException;
+import java.math.BigDecimal;
+import java.util.AbstractMap.SimpleEntry;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.antlr.runtime.ClassicToken;
+import org.antlr.runtime.CommonToken;
+import org.antlr.runtime.tree.TreeVisitor;
+import org.antlr.runtime.tree.TreeVisitorAction;
+import org.apache.calcite.adapter.druid.DruidQuery;
+import org.apache.calcite.adapter.druid.DruidRules;
+import org.apache.calcite.adapter.druid.DruidSchema;
+import org.apache.calcite.adapter.druid.DruidTable;
+import org.apache.calcite.adapter.druid.LocalInterval;
+import org.apache.calcite.config.CalciteConnectionConfigImpl;
+import org.apache.calcite.config.CalciteConnectionProperty;
+import org.apache.calcite.plan.RelOptCluster;
+import org.apache.calcite.plan.RelOptMaterialization;
+import org.apache.calcite.plan.RelOptPlanner;
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptSchema;
+import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.plan.RelTraitSet;
+import org.apache.calcite.plan.hep.HepMatchOrder;
+import org.apache.calcite.plan.hep.HepPlanner;
+import org.apache.calcite.plan.hep.HepProgram;
+import org.apache.calcite.plan.hep.HepProgramBuilder;
+import org.apache.calcite.rel.RelCollation;
+import org.apache.calcite.rel.RelCollationImpl;
+import org.apache.calcite.rel.RelCollations;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.rel.core.Filter;
+import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.rel.core.SetOp;
+import org.apache.calcite.rel.core.TableScan;
+import org.apache.calcite.rel.metadata.CachingRelMetadataProvider;
+import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider;
+import org.apache.calcite.rel.metadata.DefaultRelMetadataProvider;
+import org.apache.calcite.rel.metadata.JaninoRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rel.rules.FilterMergeRule;
+import org.apache.calcite.rel.rules.JoinToMultiJoinRule;
+import org.apache.calcite.rel.rules.LoptOptimizeJoinRule;
+import org.apache.calcite.rel.rules.ProjectMergeRule;
+import org.apache.calcite.rel.rules.ProjectRemoveRule;
+import org.apache.calcite.rel.rules.SemiJoinFilterTransposeRule;
+import org.apache.calcite.rel.rules.SemiJoinJoinTransposeRule;
+import org.apache.calcite.rel.rules.SemiJoinProjectTransposeRule;
+import org.apache.calcite.rel.rules.UnionMergeRule;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rel.type.RelDataTypeFactory;
+import org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.calcite.rel.type.RelDataTypeImpl;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexExecutor;
+import org.apache.calcite.rex.RexFieldCollation;
+import org.apache.calcite.rex.RexInputRef;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexUtil;
+import org.apache.calcite.rex.RexWindowBound;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.sql.SqlAggFunction;
+import org.apache.calcite.sql.SqlCall;
+import org.apache.calcite.sql.SqlExplainLevel;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.SqlLiteral;
+import org.apache.calcite.sql.SqlNode;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.calcite.sql.SqlWindow;
+import org.apache.calcite.sql.parser.SqlParserPos;
+import org.apache.calcite.sql.type.SqlTypeName;
+import org.apache.calcite.tools.Frameworks;
+import org.apache.calcite.util.CompositeList;
+import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.ImmutableIntList;
+import org.apache.calcite.util.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.conf.Constants;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.QueryProperties;
+import org.apache.hadoop.hive.ql.QueryState;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.log.PerfLogger;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSubquerySemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteViewSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HivePlannerContext;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil;
+import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveAlgorithmsConf;
+import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveVolcanoPlanner;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateProjectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregatePullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExceptRewriteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExpandDistinctAggregatesRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterAggregateTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTSTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSetOpTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSortTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveInsertExchange4JoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveIntersectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveIntersectRewriteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinAddNotNullRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinCommuteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinPushTransitivePredicatesRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinToMultiJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePartitionPruneRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePreFilteringRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectFilterPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectOverIntersectRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectSortTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveReduceExpressionsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveReduceExpressionsWithStatsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelDecorrelator;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortLimitPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortUnionReduceRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSubQueryRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializedViewFilterScanRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTBuilder;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinCondTypeCheckProcFactory;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinTypeCheckCtx;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.PlanModifierForReturnPath;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.RexNodeConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec;
+import org.apache.hadoop.hive.ql.parse.QBExpr.Opcode;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.HiveOperation;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableList.Builder;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import org.apache.calcite.config.CalciteConnectionConfig;
+
+public class CalcitePlanner extends SemanticAnalyzer {
+
+ private final AtomicInteger noColsMissingStats = new AtomicInteger(0);
+ private SemanticException semanticException;
+ private boolean runCBO = true;
+ private boolean disableSemJoinReordering = true;
+ private EnumSet<ExtendedCBOProfile> profilesCBO;
+
+ public CalcitePlanner(QueryState queryState) throws SemanticException {
+ super(queryState);
+ if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) {
+ runCBO = false;
+ disableSemJoinReordering = false;
+ }
+ }
+
+ public void resetCalciteConfiguration() {
+ if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) {
+ runCBO = true;
+ disableSemJoinReordering = true;
+ }
+ }
+
+ @Override
+ @SuppressWarnings("nls")
+ public void analyzeInternal(ASTNode ast) throws SemanticException {
+ if (runCBO) {
+ PreCboCtx cboCtx = new PreCboCtx();
+ super.analyzeInternal(ast, cboCtx);
+ } else {
+ super.analyzeInternal(ast);
+ }
+ }
+
+ /**
+ * This method is useful if we want to obtain the logical plan after being parsed and
+ * optimized by Calcite.
+ *
+ * @return the Calcite plan for the query, null if it could not be generated
+ */
+ public RelNode genLogicalPlan(ASTNode ast) throws SemanticException {
+ LOG.info("Starting generating logical plan");
+ PreCboCtx cboCtx = new PreCboCtx();
+ //change the location of position alias process here
+ processPositionAlias(ast);
+ if (!genResolvedParseTree(ast, cboCtx)) {
+ return null;
+ }
+ ASTNode queryForCbo = ast;
+ if (cboCtx.type == PreCboCtx.Type.CTAS || cboCtx.type == PreCboCtx.Type.VIEW) {
+ queryForCbo = cboCtx.nodeOfInterest; // nodeOfInterest is the query
+ }
+ runCBO = canCBOHandleAst(queryForCbo, getQB(), cboCtx);
+ if (!runCBO) {
+ return null;
+ }
+ profilesCBO = obtainCBOProfiles(queryProperties);
+ disableJoinMerge = true;
+ final RelNode resPlan = logicalPlan();
+ LOG.info("Finished generating logical plan");
+ return resPlan;
+ }
+
+ @Override
+ @SuppressWarnings("rawtypes")
+ Operator genOPTree(ASTNode ast, PlannerContext plannerCtx) throws SemanticException {
+ Operator sinkOp = null;
+ boolean skipCalcitePlan = false;
+
+ if (!runCBO) {
+ skipCalcitePlan = true;
+ } else {
+ PreCboCtx cboCtx = (PreCboCtx) plannerCtx;
+
+ // Note: for now, we don't actually pass the queryForCbo to CBO, because
+ // it accepts qb, not AST, and can also access all the private stuff in
+ // SA. We rely on the fact that CBO ignores the unknown tokens (create
+ // table, destination), so if the query is otherwise ok, it is as if we
+ // did remove those and gave CBO the proper AST. That is kinda hacky.
+ ASTNode queryForCbo = ast;
+ if (cboCtx.type == PreCboCtx.Type.CTAS || cboCtx.type == PreCboCtx.Type.VIEW) {
+ queryForCbo = cboCtx.nodeOfInterest; // nodeOfInterest is the query
+ }
+ runCBO = canCBOHandleAst(queryForCbo, getQB(), cboCtx);
+ if (queryProperties.hasMultiDestQuery()) {
+ handleMultiDestQuery(ast, cboCtx);
+ }
+
+ if (runCBO) {
+ profilesCBO = obtainCBOProfiles(queryProperties);
+
+ disableJoinMerge = true;
+ boolean reAnalyzeAST = false;
+ final boolean materializedView = getQB().isMaterializedView();
+
+ try {
+ if (this.conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) {
+ if (cboCtx.type == PreCboCtx.Type.VIEW && !materializedView) {
+ throw new SemanticException("Create view is not supported in cbo return path.");
+ }
+ sinkOp = getOptimizedHiveOPDag();
+ LOG.info("CBO Succeeded; optimized logical plan.");
+ this.ctx.setCboInfo("Plan optimized by CBO.");
+ this.ctx.setCboSucceeded(true);
+ } else {
+ // 1. Gen Optimized AST
+ ASTNode newAST = getOptimizedAST();
+
+ // 1.1. Fix up the query for insert/ctas/materialized views
+ newAST = fixUpAfterCbo(ast, newAST, cboCtx);
+
+ // 2. Regen OP plan from optimized AST
+ if (cboCtx.type == PreCboCtx.Type.VIEW && !materializedView) {
+ try {
+ handleCreateViewDDL(newAST);
+ } catch (SemanticException e) {
+ throw new CalciteViewSemanticException(e.getMessage());
+ }
+ } else {
+ init(false);
+ if (cboCtx.type == PreCboCtx.Type.VIEW && materializedView) {
+ // Redo create-table/view analysis, because it's not part of
+ // doPhase1.
+ // Use the REWRITTEN AST
+ setAST(newAST);
+ newAST = reAnalyzeViewAfterCbo(newAST);
+ // Store text of the ORIGINAL QUERY
+ String originalText = ctx.getTokenRewriteStream().toString(
+ cboCtx.nodeOfInterest.getTokenStartIndex(),
+ cboCtx.nodeOfInterest.getTokenStopIndex());
+ createVwDesc.setViewOriginalText(originalText);
+ viewSelect = newAST;
+ viewsExpanded = new ArrayList<>();
+ viewsExpanded.add(createVwDesc.getViewName());
+ } else if (cboCtx.type == PreCboCtx.Type.CTAS) {
+ // CTAS
+ setAST(newAST);
+ newAST = reAnalyzeCTASAfterCbo(newAST);
+ }
+ }
+ Phase1Ctx ctx_1 = initPhase1Ctx();
+ if (!doPhase1(newAST, getQB(), ctx_1, null)) {
+ throw new RuntimeException("Couldn't do phase1 on CBO optimized query plan");
+ }
+ // unfortunately making prunedPartitions immutable is not possible
+ // here with SemiJoins not all tables are costed in CBO, so their
+ // PartitionList is not evaluated until the run phase.
+ getMetaData(getQB());
+
+ disableJoinMerge = defaultJoinMerge;
+ sinkOp = genPlan(getQB());
+ LOG.info("CBO Succeeded; optimized logical plan.");
+ this.ctx.setCboInfo("Plan optimized by CBO.");
+ this.ctx.setCboSucceeded(true);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(newAST.dump());
+ }
+ }
+ } catch (Exception e) {
+ boolean isMissingStats = noColsMissingStats.get() > 0;
+ if (isMissingStats) {
+ LOG.error("CBO failed due to missing column stats (see previous errors), skipping CBO");
+ this.ctx
+ .setCboInfo("Plan not optimized by CBO due to missing statistics. Please check log for more details.");
+ } else {
+ LOG.error("CBO failed, skipping CBO. ", e);
+ if (e instanceof CalciteSemanticException) {
+ CalciteSemanticException calciteSemanticException = (CalciteSemanticException) e;
+ UnsupportedFeature unsupportedFeature = calciteSemanticException
+ .getUnsupportedFeature();
+ if (unsupportedFeature != null) {
+ this.ctx.setCboInfo("Plan not optimized by CBO due to missing feature ["
+ + unsupportedFeature + "].");
+ } else {
+ this.ctx.setCboInfo("Plan not optimized by CBO.");
+ }
+ } else {
+ this.ctx.setCboInfo("Plan not optimized by CBO.");
+ }
+ }
+ if( e instanceof CalciteSubquerySemanticException) {
+ // non-cbo path retries to execute subqueries and throws completely different exception/error
+ // to eclipse the original error message
+ // so avoid executing subqueries on non-cbo
+ throw new SemanticException(e);
+ }
+ else if( e instanceof CalciteViewSemanticException) {
+ // non-cbo path retries to execute create view and
+ // we believe it will throw the same error message
+ throw new SemanticException(e);
+ }
+ else if (!conf.getBoolVar(ConfVars.HIVE_IN_TEST) || isMissingStats
+ || e instanceof CalciteSemanticException ) {
+ reAnalyzeAST = true;
+ } else if (e instanceof SemanticException) {
+ // although, its likely to be a valid exception, we will retry
+ // with cbo off anyway.
+ reAnalyzeAST = true;
+ } else if (e instanceof RuntimeException) {
+ throw (RuntimeException) e;
+ } else {
+ throw new SemanticException(e);
+ }
+ } finally {
+ runCBO = false;
+ disableJoinMerge = defaultJoinMerge;
+ disableSemJoinReordering = false;
+ if (reAnalyzeAST) {
+ init(true);
+ prunedPartitions.clear();
+ // Assumption: At this point Parse Tree gen & resolution will always
+ // be true (since we started out that way).
+ super.genResolvedParseTree(ast, new PlannerContext());
+ skipCalcitePlan = true;
+ }
+ }
+ } else {
+ this.ctx.setCboInfo("Plan not optimized by CBO.");
+ skipCalcitePlan = true;
+ }
+ }
+
+ if (skipCalcitePlan) {
+ sinkOp = super.genOPTree(ast, plannerCtx);
+ }
+
+ return sinkOp;
+ }
+
+ private void handleCreateViewDDL(ASTNode newAST) throws SemanticException {
+ saveViewDefinition();
+ String originalText = createVwDesc.getViewOriginalText();
+ String expandedText = createVwDesc.getViewExpandedText();
+ List<FieldSchema> schema = createVwDesc.getSchema();
+ List<FieldSchema> partitionColumns = createVwDesc.getPartCols();
+ init(false);
+ setAST(newAST);
+ newAST = reAnalyzeViewAfterCbo(newAST);
+ createVwDesc.setViewOriginalText(originalText);
+ createVwDesc.setViewExpandedText(expandedText);
+ createVwDesc.setSchema(schema);
+ createVwDesc.setPartCols(partitionColumns);
+ }
+
+ /*
+ * Tries to optimize the FROM clause of a multi-insert query. No attempt is made to optimize
+ * the insert clauses. If the FROM clause cannot be rewritten, CBO is disabled for the query
+ * (runCBO is set to false).
+ */
+ private void handleMultiDestQuery(ASTNode ast, PreCboCtx cboCtx) throws SemanticException {
+ // Not supported by CBO
+ if (!runCBO) {
+ return;
+ }
+ // Currently, we only optimize the content of the FROM clause
+ // for multi-insert queries. Thus, nodeOfInterest is the FROM clause.
+ if (isJoinToken(cboCtx.nodeOfInterest)) {
+ // Join clause: rewriting is needed
+ ASTNode subq = rewriteASTForMultiInsert(ast, cboCtx.nodeOfInterest);
+ if (subq != null) {
+ // We could rewrite into a subquery
+ cboCtx.nodeOfInterest = (ASTNode) subq.getChild(0);
+ QB newQB = new QB(null, "", false);
+ Phase1Ctx ctx_1 = initPhase1Ctx();
+ doPhase1(cboCtx.nodeOfInterest, newQB, ctx_1, null);
+ setQB(newQB);
+ getMetaData(getQB());
+ } else {
+ runCBO = false;
+ }
+ } else if (cboCtx.nodeOfInterest.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+ // Subquery: no rewriting needed
+ ASTNode subq = cboCtx.nodeOfInterest;
+ // First child is subquery, second child is alias
+ // We set the node of interest and QB to the subquery
+ // We do not need to generate the QB again, but rather we use it directly
+ cboCtx.nodeOfInterest = (ASTNode) subq.getChild(0);
+ String subQAlias = unescapeIdentifier(subq.getChild(1).getText());
+ final QB newQB = getQB().getSubqForAlias(subQAlias).getQB();
+ newQB.getParseInfo().setAlias("");
+ newQB.getParseInfo().setIsSubQ(false);
+ setQB(newQB);
+ } else {
+ // No need to run CBO (table reference or virtual table), or the case is not supported
+ runCBO = false;
+ }
+ }
+
+ private ASTNode rewriteASTForMultiInsert(ASTNode query, ASTNode nodeOfInterest) {
+ // 1. gather references from original query
+ // This is a map from aliases to references.
+ // We keep all references as we will need to modify them after creating
+ // the subquery
+ final Multimap<String, Object> aliasNodes = ArrayListMultimap.create();
+ // To know if we need to bail out
+ final AtomicBoolean notSupported = new AtomicBoolean(false);
+ TreeVisitorAction action = new TreeVisitorAction() {
+ @Override
+ public Object pre(Object t) {
+ if (!notSupported.get()) {
+ if (ParseDriver.adaptor.getType(t) == HiveParser.TOK_ALLCOLREF) {
+ // TODO: this is a limitation of the AST rewriting approach that we will
+ // not be able to overcome till proper integration of full multi-insert
+ // queries with Calcite is implemented.
+ // The current rewriting gathers references from the insert clauses and then
+ // updates them with the new subquery references. However, if the insert
+ // clauses use * or tab.*, we cannot resolve the columns that we are
+ // referring to. Thus, we just bail out and such queries are currently
+ // not optimized by Calcite.
+ // An example of such query is:
+ // FROM T_A a LEFT JOIN T_B b ON a.id = b.id
+ // INSERT OVERWRITE TABLE join_result_1
+ // SELECT a.*, b.*
+ // INSERT OVERWRITE TABLE join_result_3
+ // SELECT a.*, b.*;
+ notSupported.set(true);
+ } else if (ParseDriver.adaptor.getType(t) == HiveParser.DOT) {
+ Object c = ParseDriver.adaptor.getChild(t, 0);
+ if (c != null && ParseDriver.adaptor.getType(c) == HiveParser.TOK_TABLE_OR_COL) {
+ aliasNodes.put(((ASTNode) t).toStringTree(), t);
+ }
+ } else if (ParseDriver.adaptor.getType(t) == HiveParser.TOK_TABLE_OR_COL) {
+ Object p = ParseDriver.adaptor.getParent(t);
+ if (p == null || ParseDriver.adaptor.getType(p) != HiveParser.DOT) {
+ aliasNodes.put(((ASTNode) t).toStringTree(), t);
+ }
+ }
+ }
+ return t;
+ }
+ @Override
+ public Object post(Object t) {
+ return t;
+ }
+ };
+ TreeVisitor tv = new TreeVisitor(ParseDriver.adaptor);
+ // We will iterate through the children: if it is an INSERT, we will traverse
+ // the subtree to gather the references
+ for (int i = 0; i < query.getChildCount(); i++) {
+ ASTNode child = (ASTNode) query.getChild(i);
+ if (ParseDriver.adaptor.getType(child) != HiveParser.TOK_INSERT) {
+ // If it is not an INSERT, we do not need to do anything.
+ continue;
+ }
+ tv.visit(child, action);
+ }
+ if (notSupported.get()) {
+ // Bail out
+ return null;
+ }
+ // 2. rewrite into query
+ // TOK_QUERY
+ // TOK_FROM
+ // join
+ // TOK_INSERT
+ // TOK_DESTINATION
+ // TOK_DIR
+ // TOK_TMP_FILE
+ // TOK_SELECT
+ // refs
+ ASTNode from = new ASTNode(new CommonToken(HiveParser.TOK_FROM, "TOK_FROM"));
+ from.addChild((ASTNode) ParseDriver.adaptor.dupTree(nodeOfInterest));
+ ASTNode destination = new ASTNode(new CommonToken(HiveParser.TOK_DESTINATION, "TOK_DESTINATION"));
+ ASTNode dir = new ASTNode(new CommonToken(HiveParser.TOK_DIR, "TOK_DIR"));
+ ASTNode tmpFile = new ASTNode(new CommonToken(HiveParser.TOK_TMP_FILE, "TOK_TMP_FILE"));
+ dir.addChild(tmpFile);
+ destination.addChild(dir);
+ ASTNode select = new ASTNode(new CommonToken(HiveParser.TOK_SELECT, "TOK_SELECT"));
+ int num = 0;
+ for (Collection<Object> selectIdentifier : aliasNodes.asMap().values()) {
+ Iterator<Object> it = selectIdentifier.iterator();
+ ASTNode node = (ASTNode) it.next();
+ // Add select expression
+ ASTNode selectExpr = new ASTNode(new CommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR"));
+ selectExpr.addChild((ASTNode) ParseDriver.adaptor.dupTree(node)); // Identifier
+ String colAlias = "col" + num;
+ selectExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias))); // Alias
+ select.addChild(selectExpr);
+ // Rewrite all INSERT references (all the node values for this key)
+ ASTNode colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+ colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
+ replaceASTChild(node, colExpr);
+ while (it.hasNext()) {
+ // Loop to rewrite rest of INSERT references
+ node = (ASTNode) it.next();
+ colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+ colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
+ replaceASTChild(node, colExpr);
+ }
+ num++;
+ }
+ ASTNode insert = new ASTNode(new CommonToken(HiveParser.TOK_INSERT, "TOK_INSERT"));
+ insert.addChild(destination);
+ insert.addChild(select);
+ ASTNode newQuery = new ASTNode(new CommonToken(HiveParser.TOK_QUERY, "TOK_QUERY"));
+ newQuery.addChild(from);
+ newQuery.addChild(insert);
+ // 3. create subquery
+ ASTNode subq = new ASTNode(new CommonToken(HiveParser.TOK_SUBQUERY, "TOK_SUBQUERY"));
+ subq.addChild(newQuery);
+ subq.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, "subq")));
+ replaceASTChild(nodeOfInterest, subq);
+ // 4. return subquery
+ return subq;
+ }
+
+ /**
+ * Can CBO handle the given AST?
+ *
+ * @param ast
+ * Top level AST
+ * @param qb
+ * Top level QB corresponding to the AST
+ * @param cboCtx
+ * @return boolean
+ *
+ * Assumption:<br>
+ * If the top level QB is a query, then everything below it must also be
+ * a query.
+ */
+ boolean canCBOHandleAst(ASTNode ast, QB qb, PreCboCtx cboCtx) {
+ int root = ast.getToken().getType();
+ boolean needToLogMessage = STATIC_LOG.isInfoEnabled();
+ boolean isSupportedRoot = root == HiveParser.TOK_QUERY || root == HiveParser.TOK_EXPLAIN
+ || qb.isCTAS() || qb.isMaterializedView();
+ // Queries without a source table currently are not supported by CBO
+ boolean isSupportedType = (qb.getIsQuery() && !qb.containsQueryWithoutSourceTable())
+ || qb.isCTAS() || qb.isMaterializedView() || cboCtx.type == PreCboCtx.Type.INSERT
+ || cboCtx.type == PreCboCtx.Type.MULTI_INSERT;
+ boolean noBadTokens = HiveCalciteUtil.validateASTForUnsupportedTokens(ast);
+ boolean result = isSupportedRoot && isSupportedType && noBadTokens;
+
+ if (!result) {
+ if (needToLogMessage) {
+ String msg = "";
+ if (!isSupportedRoot) {
+ msg += "doesn't have QUERY or EXPLAIN as root and not a CTAS; ";
+ }
+ if (!isSupportedType) {
+ msg += "is not a query with at least one source table "
+ + " or there is a subquery without a source table, or CTAS, or insert; ";
+ }
+ if (!noBadTokens) {
+ msg += "has unsupported tokens; ";
+ }
+
+ if (msg.isEmpty()) {
+ msg += "has some unspecified limitations; ";
+ }
+ STATIC_LOG.info("Not invoking CBO because the statement "
+ + msg.substring(0, msg.length() - 2));
+ }
+ return false;
+ }
+ // Now check QB in more detail. canHandleQbForCbo returns null if query can
+ // be handled.
+ String msg = CalcitePlanner.canHandleQbForCbo(queryProperties, conf, true, needToLogMessage, qb);
+ if (msg == null) {
+ return true;
+ }
+ if (needToLogMessage) {
+ STATIC_LOG.info("Not invoking CBO because the statement "
+ + msg.substring(0, msg.length() - 2));
+ }
+ return false;
+ }
+
+ /**
+ * Checks whether Calcite can handle the query.
+ *
+ * @param queryProperties
+ * @param conf
+ * @param topLevelQB
+ * Does the QB correspond to the topmost query block?
+ * @param verbose
+ * Whether return value should be verbose in case of failure.
+ * @return null if the query can be handled; non-null reason string if it
+ * cannot be.
+ *
+ * Assumption:<br>
+ * 1. If the top level QB is a query, then everything below it must also be
+ * a query<br>
+ * 2. A nested subquery will return false for qbToChk.getIsQuery()
+ */
+ static String canHandleQbForCbo(QueryProperties queryProperties, HiveConf conf,
+ boolean topLevelQB, boolean verbose, QB qb) {
+
+ if (!queryProperties.hasClusterBy() && !queryProperties.hasDistributeBy()
+ && !queryProperties.hasSortBy() && !queryProperties.hasPTF() && !queryProperties.usesScript()
+ && !queryProperties.hasLateralViews()) {
+ // Ok to run CBO.
+ return null;
+ }
+
+ // Not ok to run CBO, build error message.
+ String msg = "";
+ if (verbose) {
+ if (queryProperties.hasClusterBy())
+ msg += "has cluster by; ";
+ if (queryProperties.hasDistributeBy())
+ msg += "has distribute by; ";
+ if (queryProperties.hasSortBy())
+ msg += "has sort by; ";
+ if (queryProperties.hasPTF())
+ msg += "has PTF; ";
+ if (queryProperties.usesScript())
+ msg += "uses scripts; ";
+ if (queryProperties.hasLateralViews())
+ msg += "has lateral views; ";
+
+ if (msg.isEmpty())
+ msg += "has some unspecified limitations; ";
+ }
+ return msg;
+ }
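For illustration, a minimal caller-side sketch of the contract documented above; it mirrors the call made from canCBOHandleAst, assumes queryProperties, conf and qb are in scope, and is not part of this patch:

    String reason = CalcitePlanner.canHandleQbForCbo(queryProperties, conf, true, true, qb);
    if (reason == null) {
      // CBO can handle this query block
    } else {
      // reason is a "; "-terminated list of blockers, e.g. "has sort by; has PTF; "
    }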
+
+ /* This method adds the appropriate CBO profiles to the returned set, depending
+ * on the query characteristics. */
+ private static EnumSet<ExtendedCBOProfile> obtainCBOProfiles(QueryProperties queryProperties) {
+ EnumSet<ExtendedCBOProfile> profilesCBO = EnumSet.noneOf(ExtendedCBOProfile.class);
+ // If the query contains more than one join
+ if (queryProperties.getJoinCount() > 1) {
+ profilesCBO.add(ExtendedCBOProfile.JOIN_REORDERING);
+ }
+ // If the query contains windowing processing
+ if (queryProperties.hasWindowing()) {
+ profilesCBO.add(ExtendedCBOProfile.WINDOWING_POSTPROCESSING);
+ }
+ return profilesCBO;
+ }
+
+ @Override
+ boolean isCBOExecuted() {
+ return runCBO;
+ }
+
+ @Override
+ boolean continueJoinMerge() {
+ return !(runCBO && disableSemJoinReordering);
+ }
+
+ @Override
+ Table materializeCTE(String cteName, CTEClause cte) throws HiveException {
+
+ ASTNode createTable = new ASTNode(new ClassicToken(HiveParser.TOK_CREATETABLE));
+
+ ASTNode tableName = new ASTNode(new ClassicToken(HiveParser.TOK_TABNAME));
+ tableName.addChild(new ASTNode(new ClassicToken(HiveParser.Identifier, cteName)));
+
+ ASTNode temporary = new ASTNode(new ClassicToken(HiveParser.KW_TEMPORARY, MATERIALIZATION_MARKER));
+
+ createTable.addChild(tableName);
+ createTable.addChild(temporary);
+ createTable.addChild(cte.cteNode);
+
+ CalcitePlanner analyzer = new CalcitePlanner(queryState);
+ analyzer.initCtx(ctx);
+ analyzer.init(false);
+
+ // should share cte contexts
+ analyzer.aliasToCTEs.putAll(aliasToCTEs);
+
+ HiveOperation operation = queryState.getHiveOperation();
+ try {
+ analyzer.analyzeInternal(createTable);
+ } finally {
+ queryState.setCommandType(operation);
+ }
+
+ Table table = analyzer.tableDesc.toTable(conf);
+ Path location = table.getDataLocation();
+ try {
+ location.getFileSystem(conf).mkdirs(location);
+ } catch (IOException e) {
+ throw new HiveException(e);
+ }
+ table.setMaterializedTable(true);
+
+ LOG.info(cteName + " will be materialized into " + location);
+ cte.table = table;
+ cte.source = analyzer;
+
+ ctx.addMaterializedTable(cteName, table);
+ // For CalcitePlanner, store qualified name too
+ ctx.addMaterializedTable(table.getDbName() + "." + table.getTableName(), table);
+
+ return table;
+ }
+
+ @Override
+ String fixCtasColumnName(String colName) {
+ if (runCBO) {
+ int lastDot = colName.lastIndexOf('.');
+ if (lastDot < 0)
+ return colName; // alias is not fully qualified
+ String nqColumnName = colName.substring(lastDot + 1);
+ STATIC_LOG.debug("Replacing " + colName + " (produced by CBO) by " + nqColumnName);
+ return nqColumnName;
+ }
+
+ return super.fixCtasColumnName(colName);
+ }
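As an illustrative sketch (not part of this patch), the method above strips the CBO-produced qualifier after the last dot when CBO is on; "planner" below is a hypothetical CalcitePlanner instance with runCBO enabled:

    // qualified alias produced by CBO: keep only the part after the last '.'
    String a = planner.fixCtasColumnName("t1.x"); // -> "x"
    // unqualified alias: returned unchanged
    String b = planner.fixCtasColumnName("y");    // -> "y"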
+
+ /**
+ * The context that doPhase1 uses to populate information pertaining to CBO
+ * (currently, this is used for CTAS and insert-as-select).
+ */
+ static class PreCboCtx extends PlannerContext {
+ enum Type {
+ NONE, INSERT, MULTI_INSERT, CTAS, VIEW, UNEXPECTED
+ }
+
+ private ASTNode nodeOfInterest;
+ private Type type = Type.NONE;
+
+ private void set(Type type, ASTNode ast) {
+ if (this.type != Type.NONE) {
+ STATIC_LOG.warn("Setting " + type + " when already " + this.type + "; node " + ast.dump()
+ + " vs old node " + nodeOfInterest.dump());
+ this.type = Type.UNEXPECTED;
+ return;
+ }
+ this.type = type;
+ this.nodeOfInterest = ast;
+ }
+
+ @Override
+ void setCTASToken(ASTNode child) {
+ set(PreCboCtx.Type.CTAS, child);
+ }
+
+ @Override
+ void setViewToken(ASTNode child) {
+ set(PreCboCtx.Type.VIEW, child);
+ }
+
+ @Override
+ void setInsertToken(ASTNode ast, boolean isTmpFileDest) {
+ if (!isTmpFileDest) {
+ set(PreCboCtx.Type.INSERT, ast);
+ }
+ }
+
+ @Override
+ void setMultiInsertToken(ASTNode child) {
+ set(PreCboCtx.Type.MULTI_INSERT, child);
+ }
+
+ @Override
+ void resetToken() {
+ this.type = Type.NONE;
+ this.nodeOfInterest = null;
+ }
+ }
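A hypothetical sequence, only to illustrate the small state machine implemented by set()/resetToken() above (ctasNode and insertNode are placeholder ASTNodes; not part of this patch):

    PreCboCtx cboCtx = new PreCboCtx();
    cboCtx.setCTASToken(ctasNode);            // type: NONE -> CTAS, nodeOfInterest = ctasNode
    cboCtx.setInsertToken(insertNode, false); // second set(): warning logged, type degrades to UNEXPECTED
    cboCtx.resetToken();                      // type back to NONE, nodeOfInterest cleared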
+
+ ASTNode fixUpAfterCbo(ASTNode originalAst, ASTNode newAst, PreCboCtx cboCtx)
+ throws SemanticException {
+ switch (cboCtx.type) {
+
+ case NONE:
+ // nothing to do
+ return newAst;
+
+ case CTAS:
+ case VIEW: {
+ // Patch the optimized query back into original CTAS AST, replacing the
+ // original query.
+ replaceASTChild(cboCtx.nodeOfInterest, newAst);
+ return originalAst;
+ }
+
+ case INSERT: {
+ // We need to patch the dest back to original into new query.
+ // This makes assumptions about the structure of the AST.
+ ASTNode newDest = new ASTSearcher().simpleBreadthFirstSearch(newAst, HiveParser.TOK_QUERY,
+ HiveParser.TOK_INSERT, HiveParser.TOK_DESTINATION);
+ if (newDest == null) {
+ LOG.error("Cannot find destination after CBO; new ast is " + newAst.dump());
+ throw new SemanticException("Cannot find destination after CBO");
+ }
+ replaceASTChild(newDest, cboCtx.nodeOfInterest);
+ return newAst;
+ }
+
+ case MULTI_INSERT: {
+ // Patch the optimized query back into original FROM clause.
+ replaceASTChild(cboCtx.nodeOfInterest, newAst);
+ return originalAst;
+ }
+
+ default:
+ throw new AssertionError("Unexpected type " + cboCtx.type);
+ }
+ }
+
+ ASTNode reAnalyzeCTASAfterCbo(ASTNode newAst) throws SemanticException {
+ // analyzeCreateTable uses this.ast, but doPhase1 doesn't, so only reset it
+ // here.
+ newAst = analyzeCreateTable(newAst, getQB(), null);
+ if (newAst == null) {
+ LOG.error("analyzeCreateTable failed to initialize CTAS after CBO;" + " new ast is "
+ + getAST().dump());
+ throw new SemanticException("analyzeCreateTable failed to initialize CTAS after CBO");
+ }
+ return newAst;
+ }
+
+ ASTNode reAnalyzeViewAfterCbo(ASTNode newAst) throws SemanticException {
+ // analyzeCreateView uses this.ast, but doPhase1 doesn't, so only reset it
+ // here.
+ newAst = analyzeCreateView(newAst, getQB(), null);
+ if (newAst == null) {
+ LOG.error("analyzeCreateTable failed to initialize materialized view after CBO;" + " new ast is "
+ + getAST().dump());
+ throw new SemanticException("analyzeCreateTable failed to initialize materialized view after CBO");
+ }
+ return newAst;
+ }
+
+
+ public static class ASTSearcher {
+ private final LinkedList<ASTNode> searchQueue = new LinkedList<ASTNode>();
+
+ /**
+ * Performs a breadth-first search of the AST for a nested set of tokens. Tokens
+ * don't have to be each other's direct children; they can be separated by
+ * layers of other tokens. For each token in the list, the first one found is
+ * matched and there's no backtracking; thus, if AST has multiple instances of
+ * some token, of which only one matches, it is not guaranteed to be found. We
+ * use this for simple things. Not thread-safe - reuses searchQueue.
+ */
+ public ASTNode simpleBreadthFirstSearch(ASTNode ast, int... tokens) {
+ searchQueue.clear();
+ searchQueue.add(ast);
+ for (int i = 0; i < tokens.length; ++i) {
+ boolean found = false;
+ int token = tokens[i];
+ while (!searchQueue.isEmpty() && !found) {
+ ASTNode next = searchQueue.poll();
+ found = next.getType() == token;
+ if (found) {
+ if (i == tokens.length - 1)
+ return next;
+ searchQueue.clear();
+ }
+ for (int j = 0; j < next.getChildCount(); ++j) {
+ searchQueue.add((ASTNode) next.getChild(j));
+ }
+ }
+ if (!found)
+ return null;
+ }
+ return null;
+ }
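For example, the INSERT fix-up in fixUpAfterCbo above uses this search to locate the destination node; a sketch of that call (the HiveParser token constants are the ones already used in this file):

    ASTNode newDest = new ASTSearcher().simpleBreadthFirstSearch(newAst,
        HiveParser.TOK_QUERY, HiveParser.TOK_INSERT, HiveParser.TOK_DESTINATION);
    // newDest is null if any token in the chain cannot be found under the previous match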
+
+ public ASTNode depthFirstSearch(ASTNode ast, int token) {
+ searchQueue.clear();
+ searchQueue.add(ast);
+ while (!searchQueue.isEmpty()) {
+ ASTNode next = searchQueue.poll();
+ if (next.getType() == token) return next;
+ for (int j = 0; j < next.getChildCount(); ++j) {
+ searchQueue.add((ASTNode) next.getChild(j));
+ }
+ }
+ return null;
+ }
+
+ public ASTNode simpleBreadthFirstSearchAny(ASTNode ast, int... tokens) {
+ searchQueue.clear();
+ searchQueue.add(ast);
+ while (!searchQueue.isEmpty()) {
+ ASTNode next = searchQueue.poll();
+ for (int i = 0; i < tokens.length; ++i) {
+ if (next.getType() == tokens[i]) return next;
+ }
+ for (int i = 0; i < next.getChildCount(); ++i) {
+ searchQueue.add((ASTNode) next.getChild(i));
+ }
+ }
+ return null;
+ }
+
+ public void reset() {
+ searchQueue.clear();
+ }
+ }
+
+ private static void replaceASTChild(ASTNode child, ASTNode newChild) {
+ ASTNode parent = (ASTNode) child.parent;
+ int childIndex = child.childIndex;
+ parent.deleteChild(childIndex);
+ parent.insertChild(childIndex, newChild);
+ }
+
+ /**
+ * Get optimized logical plan for the given QB tree in the semAnalyzer.
+ *
+ * @return
+ * @throws SemanticException
+ */
+ RelNode logicalPlan() throws SemanticException {
+ RelNode optimizedOptiqPlan = null;
+
+ CalcitePlannerAction calcitePlannerAction = null;
+ if (this.columnAccessInfo == null) {
+ this.columnAccessInfo = new ColumnAccessInfo();
+ }
+ calcitePlannerAction = new CalcitePlannerAction(prunedPartitions, this.columnAccessInfo);
+
+ try {
+ optimizedOptiqPlan = Frameworks.withPlanner(calcitePlannerAction, Frameworks
+ .newConfigBuilder().typeSystem(new HiveTypeSystemImpl()).build());
+ } catch (Exception e) {
+ rethrowCalciteException(e);
+ throw new AssertionError("rethrowCalciteException didn't throw for " + e.getMessage());
+ }
+ return optimizedOptiqPlan;
+ }
+
+ /**
+ * Get Optimized AST for the given QB tree in the semAnalyzer.
+ *
+ * @return Optimized operator tree translated in to Hive AST
+ * @throws SemanticException
+ */
+ ASTNode getOptimizedAST() throws SemanticException {
+ RelNode optimizedOptiqPlan = logicalPlan();
+ ASTNode optiqOptimizedAST = ASTConverter.convert(optimizedOptiqPlan, resultSchema,
+ HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_COLUMN_ALIGNMENT));
+ return optiqOptimizedAST;
+ }
+
+ /**
+ * Get Optimized Hive Operator DAG for the given QB tree in the semAnalyzer.
+ *
+ * @return Optimized Hive operator tree
+ * @throws SemanticException
+ */
+ Operator getOptimizedHiveOPDag() throws SemanticException {
+ RelNode optimizedOptiqPlan = null;
+ CalcitePlannerAction calcitePlannerAction = null;
+ if (this.columnAccessInfo == null) {
+ this.columnAccessInfo = new ColumnAccessInfo();
+ }
+ calcitePlannerAction = new CalcitePlannerAction(prunedPartitions, this.columnAccessInfo);
+
+ try {
+ optimizedOptiqPlan = Frameworks.withPlanner(calcitePlannerAction, Frameworks
+ .newConfigBuilder().typeSystem(new HiveTypeSystemImpl()).build());
+ } catch (Exception e) {
+ rethrowCalciteException(e);
+ throw new AssertionError("rethrowCalciteException didn't throw for " + e.getMessage());
+ }
+
+ RelNode modifiedOptimizedOptiqPlan = PlanModifierForReturnPath.convertOpTree(
+ optimizedOptiqPlan, resultSchema, this.getQB().getTableDesc() != null);
+
+ LOG.debug("Translating the following plan:\n" + RelOptUtil.toString(modifiedOptimizedOptiqPlan));
+ Operator<?> hiveRoot = new HiveOpConverter(this, conf, unparseTranslator, topOps)
+ .convert(modifiedOptimizedOptiqPlan);
+ RowResolver hiveRootRR = genRowResolver(hiveRoot, getQB());
+ opParseCtx.put(hiveRoot, new OpParseContext(hiveRootRR));
+ String dest = getQB().getParseInfo().getClauseNames().iterator().next();
+ if (getQB().getParseInfo().getDestSchemaForClause(dest) != null
+ && this.getQB().getTableDesc() == null) {
+ Operator<?> selOp = handleInsertStatement(dest, hiveRoot, hiveRootRR, getQB());
+ return genFileSinkPlan(dest, getQB(), selOp);
+ } else {
+ return genFileSinkPlan(dest, getQB(), hiveRoot);
+ }
+ }
+
+ // This function serves as the wrapper of handleInsertStatementSpec in
+ // SemanticAnalyzer
+ Operator<?> handleInsertStatement(String dest, Operator<?> input, RowResolver inputRR, QB qb)
+ throws SemanticException {
+ ArrayList<ExprNodeDesc> colList = new ArrayList<ExprNodeDesc>();
+ ArrayList<ColumnInfo> columns = inputRR.getColumnInfos();
+ for (int i = 0; i < columns.size(); i++) {
+ ColumnInfo col = columns.get(i);
+ colList.add(new ExprNodeColumnDesc(col));
+ }
+ ASTNode selExprList = qb.getParseInfo().getSelForClause(dest);
+
+ RowResolver out_rwsch = handleInsertStatementSpec(colList, dest, inputRR, inputRR, qb,
+ selExprList);
+
+ ArrayList<String> columnNames = new ArrayList<String>();
+ Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
+ for (int i = 0; i < colList.size(); i++) {
+ String outputCol = getColumnInternalName(i);
+ colExprMap.put(outputCol, colList.get(i));
+ columnNames.add(outputCol);
+ }
+ Operator<?> output = putOpInsertMap(OperatorFactory.getAndMakeChild(new SelectDesc(colList,
+ columnNames), new RowSchema(out_rwsch.getColumnInfos()), input), out_rwsch);
+ output.setColumnExprMap(colExprMap);
+ return output;
+ }
+
+ /***
+ * Unwraps Calcite invocation exceptions coming from the metadata provider chain and
+ * obtains the real cause.
+ *
+ * @param e the exception to unwrap
+ */
+ private void rethrowCalciteException(Exception e) throws SemanticException {
+ Throwable first = (semanticException != null) ? semanticException : e, current = first, cause = current
+ .getCause();
+ while (cause != null) {
+ Throwable causeOfCause = cause.getCause();
+ if (current == first && causeOfCause == null && isUselessCause(first)) {
+ // "cause" is a root cause, and "e"/"first" is a useless
+ // exception it's wrapped in.
+ first = cause;
+ break;
+ } else if (causeOfCause != null && isUselessCause(cause)
+ && ExceptionHelper.resetCause(current, causeOfCause)) {
+ // "cause" was a useless intermediate cause and was replace it
+ // with its own cause.
+ cause = causeOfCause;
+ continue; // do loop once again with the new cause of "current"
+ }
+ current = cause;
+ cause = current.getCause();
+ }
+
+ if (first instanceof RuntimeException) {
+ throw (RuntimeException) first;
+ } else if (first instanceof SemanticException) {
+ throw (SemanticException) first;
+ }
+ throw new RuntimeException(first);
+ }
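A rough sketch of the effect, using a hypothetical exception chain (not part of this patch): both wrappers below are "useless causes", so the innermost SemanticException is what ends up being rethrown.

    Exception wrapped = new RuntimeException(
        new InvocationTargetException(new SemanticException("real cause")));
    // rethrowCalciteException(wrapped) strips the RuntimeException and
    // InvocationTargetException wrappers and rethrows the SemanticException itself.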
+
+ private static class ExceptionHelper {
+ private static final Field CAUSE_FIELD = getField(Throwable.class, "cause"),
+ TARGET_FIELD = getField(InvocationTargetException.class, "target"),
+ MESSAGE_FIELD = getField(Throwable.class, "detailMessage");
+
+ private static Field getField(Class<?> clazz, String name) {
+ try {
+ Field f = clazz.getDeclaredField(name);
+ f.setAccessible(true);
+ return f;
+ } catch (Throwable t) {
+ return null;
+ }
+ }
+
+ public static boolean resetCause(Throwable target, Throwable newCause) {
+ try {
+ if (MESSAGE_FIELD == null)
+ return false;
+ Field field = (target instanceof InvocationTargetException) ? TARGET_FIELD : CAUSE_FIELD;
+ if (field == null)
+ return false;
+
+ Throwable oldCause = target.getCause();
+ String oldMsg = target.getMessage();
+ field.set(target, newCause);
+ if (oldMsg != null && oldMsg.equals(oldCause.toString())) {
+ MESSAGE_FIELD.set(target, newCause == null ? null : newCause.toString());
+ }
+ } catch (Throwable se) {
+ return false;
+ }
+ return true;
+ }
+ }
+
+ private boolean isUselessCause(Throwable t) {
+ return t instanceof RuntimeException || t instanceof InvocationTargetException
+ || t instanceof UndeclaredThrowableException;
+ }
+
+ private RowResolver genRowResolver(Operator op, QB qb) {
+ RowResolver rr = new RowResolver();
+ String subqAlias = (qb.getAliases().size() == 1 && qb.getSubqAliases().size() == 1) ? qb
+ .getAliases().get(0) : null;
+
+ for (ColumnInfo ci : op.getSchema().getSignature()) {
+ try {
+ rr.putWithCheck((subqAlias != null) ? subqAlias : ci.getTabAlias(),
+ ci.getAlias() != null ? ci.getAlias() : ci.getInternalName(), ci.getInternalName(),
+ new ColumnInfo(ci));
+ } catch (SemanticException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return rr;
+ }
+
+ private enum ExtendedCBOProfile {
+ JOIN_REORDERING,
+ WINDOWING_POSTPROCESSING;
+ }
+
+ /**
+ * Code responsible for Calcite plan generation and optimization.
+ */
+ private class CalcitePlannerAction implements Frameworks.PlannerAction<RelNode> {
+ private RelOptCluster cluster;
+ private RelOptSchema relOptSchema;
+ private final Map<String, PrunedPartitionList> partitionCache;
+ private final ColumnAccessInfo columnAccessInfo;
+ private Map<HiveProject, Table> viewProjectToTableSchema;
+
+ // Correlated vars across subqueries within the same query need to have different IDs;
+ // this is used in RexNodeConverter to create the correlated variable.
+ private int subqueryId;
+
+ // This keeps track of whether a subquery is correlated and contains an aggregate,
+ // since this case is special-cased when it is rewritten in SubqueryRemoveRule.
+ Set<RelNode> corrScalarRexSQWithAgg = new HashSet<RelNode>();
+
+ // TODO: Do we need to keep track of RR, ColNameToPosMap for every op or
+ // just the last one?
+ LinkedHashMap<RelNode, RowResolver> relToHiveRR = new LinkedHashMap<RelNode, RowResolver>();
+ LinkedHashMap<RelNode, ImmutableMap<String, Integer>> relToHiveColNameCalcitePosMap = new LinkedHashMap<RelNode, ImmutableMap<String, Integer>>();
+
+ CalcitePlannerAction(Map<String, PrunedPartitionList> partitionCache, ColumnAccessInfo columnAccessInfo) {
+ this.partitionCache = partitionCache;
+ this.columnAccessInfo = columnAccessInfo;
+ }
+
+ @Override
+ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlus rootSchema) {
+ RelNode calciteGenPlan = null;
+ RelNode calcitePreCboPlan = null;
+ RelNode calciteOptimizedPlan = null;
+ subqueryId = 0;
+
+ /*
+ * recreate cluster, so that it picks up the additional traitDef
+ */
+ final Double maxSplitSize = (double) HiveConf.getLongVar(
+ conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE);
+ final Double maxMemory = (double) HiveConf.getLongVar(
+ conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
+ HiveAlgorithmsConf algorithmsConf = new HiveAlgorithmsConf(maxSplitSize, maxMemory);
+ HiveRulesRegistry registry = new HiveRulesRegistry();
+ Properties calciteConfigProperties = new Properties();
+ calciteConfigProperties.setProperty(
+ CalciteConnectionProperty.MATERIALIZATIONS_ENABLED.camelName(),
+ Boolean.FALSE.toString());
+ CalciteConnectionConfig calciteConfig = new CalciteConnectionConfigImpl(calciteConfigProperties);
+ HivePlannerContext confContext = new HivePlannerContext(algorithmsConf, registry, calciteConfig,
+ corrScalarRexSQWithAgg);
+ RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(confContext);
+ final RexBuilder rexBuilder = cluster.getRexBuilder();
+ final RelOptCluster optCluster = RelOptCluster.create(planner, rexBuilder);
+
+ this.cluster = optCluster;
+ this.relOptSchema = relOptSchema;
+
+ PerfLogger perfLogger = SessionState.getPerfLogger();
+
+ // 1. Gen Calcite Plan
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ try {
+ calciteGenPlan = genLogicalPlan(getQB(), true, null, null);
+ // if this is a create-view statement, we do not use the table alias
+ resultSchema = SemanticAnalyzer.convertRowSchemaToResultSetSchema(
+ relToHiveRR.get(calciteGenPlan),
+ getQB().isView() ? false : HiveConf.getBoolVar(conf,
+ HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
+ } catch (SemanticException e) {
+ semanticException = e;
+ throw new RuntimeException(e);
+ }
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Plan generation");
+
+ // Create executor
+ RexExecutor executorProvider = new HiveRexExecutorImpl(optCluster);
+ calciteGenPlan.getCluster().getPlanner().setExecutor(executorProvider);
+
+ // We need to get the ColumnAccessInfo and viewToTableSchema for views.
+ HiveRelFieldTrimmer fieldTrimmer = new HiveRelFieldTrimmer(null,
+ HiveRelFactories.HIVE_BUILDER.create(optCluster, null), this.columnAccessInfo,
+ this.viewProjectToTableSchema);
+
+ fieldTrimmer.trim(calciteGenPlan);
+
+ // Create and set MD provider
+ HiveDefaultRelMetadataProvider mdProvider = new HiveDefaultRelMetadataProvider(conf);
+ RelMetadataQuery.THREAD_PROVIDERS.set(
+ JaninoRelMetadataProvider.of(mdProvider.getMetadataProvider()));
+
+ //Remove subquery
+ LOG.debug("Plan before removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+ calciteGenPlan = hepPlan(calciteGenPlan, false, mdProvider.getMetadataProvider(), null,
+ HiveSubQueryRemoveRule.REL_NODE);
+ LOG.debug("Plan just after removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+
+ calciteGenPlan = HiveRelDecorrelator.decorrelateQuery(calciteGenPlan);
+ LOG.debug("Plan after decorrelation:\n" + RelOptUtil.toString(calciteGenPlan));
+
+ // 2. Apply pre-join order optimizations
+ calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan,
+ mdProvider.getMetadataProvider(), executorProvider);
+
+ // 3. Apply join order optimizations: reordering MST algorithm
+ // If join optimizations failed because of missing stats, we continue with
+ // the rest of optimizations
+ if (profilesCBO.contains(ExtendedCBOProfile.JOIN_REORDERING)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ try {
+ List<RelMetadataProvider> list = Lists.newArrayList();
+ list.add(mdProvider.getMetadataProvider());
+ RelTraitSet desiredTraits = optCluster
+ .traitSetOf(HiveRelNode.CONVENTION, RelCollations.EMPTY);
+
+ HepProgramBuilder hepPgmBldr = new HepProgramBuilder().addMatchOrder(HepMatchOrder.BOTTOM_UP);
+ hepPgmBldr.addRuleInstance(new JoinToMultiJoinRule(HiveJoin.class));
+ hepPgmBldr.addRuleInstance(new LoptOptimizeJoinRule(HiveRelFactories.HIVE_BUILDER));
+
+ HepProgram hepPgm = hepPgmBldr.build();
+ HepPlanner hepPlanner = new HepPlanner(hepPgm);
+
+ hepPlanner.registerMetadataProviders(list);
+ RelMetadataProvider chainedProvider = ChainedRelMetadataProvider.of(list);
+ optCluster.setMetadataProvider(new CachingRelMetadataProvider(chainedProvider, hepPlanner));
+
+ RelNode rootRel = calcitePreCboPlan;
+ hepPlanner.setRoot(rootRel);
+ if (!calcitePreCboPlan.getTraitSet().equals(desiredTraits)) {
+ rootRel = hepPlanner.changeTraits(calcitePreCboPlan, desiredTraits);
+ }
+ hepPlanner.setRoot(rootRel);
+
+ calciteOptimizedPlan = hepPlanner.findBestExp();
+ } catch (Exception e) {
+ boolean isMissingStats = noColsMissingStats.get() > 0;
+ if (isMissingStats) {
+ LOG.warn("Missing column stats (see previous messages), skipping join reordering in CBO");
+ noColsMissingStats.set(0);
+ calciteOptimizedPlan = calcitePreCboPlan;
+ disableSemJoinReordering = false;
+ } else {
+ throw e;
+ }
+ }
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Join Reordering");
+ } else {
+ calciteOptimizedPlan = calcitePreCboPlan;
+ disableSemJoinReordering = false;
+ }
+
+ // 4. Run other optimizations that do not need stats
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, ProjectRemoveRule.INSTANCE, UnionMergeRule.INSTANCE,
+ HiveProjectMergeRule.INSTANCE_NO_FORCE, HiveAggregateProjectMergeRule.INSTANCE,
+ HiveJoinCommuteRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Optimizations without stats");
+
+ // 5. Materialized view based rewriting
+ // We disable it for CTAS and MV creation queries (trying to avoid any problem
+ // due to data freshness)
+ if (conf.getBoolVar(ConfVars.HIVE_MATERIALIZED_VIEW_ENABLE_AUTO_REWRITING) &&
+ !getQB().isMaterializedView() && !getQB().isCTAS()) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ // Use Calcite cost model for view rewriting
+ RelMetadataProvider calciteMdProvider = DefaultRelMetadataProvider.INSTANCE;
+ RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.of(calciteMdProvider));
+ planner.registerMetadataProviders(Lists.newArrayList(calciteMdProvider));
+ // Add views to planner
+ List<RelOptMaterialization> materializations = new ArrayList<>();
+ try {
+ materializations = Hive.get().getRewritingMaterializedViews();
+ // We need to use the current cluster for the scan operator on views,
+ // otherwise the planner will throw an Exception (different planners)
+ materializations = Lists.transform(materializations,
+ new Function<RelOptMaterialization, RelOptMaterialization>() {
+ @Override
+ public RelOptMaterialization apply(RelOptMaterialization materialization) {
+ final RelNode viewScan = materialization.tableRel;
+ final RelNode newViewScan;
+ if (viewScan instanceof DruidQuery) {
+ final DruidQuery dq = (DruidQuery) viewScan;
+ newViewScan = DruidQuery.create(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION),
+ viewScan.getTable(), dq.getDruidTable(),
+ ImmutableList.<RelNode>of(dq.getTableScan()));
+ } else {
+ newViewScan = new HiveTableScan(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION),
+ (RelOptHiveTable) viewScan.getTable(), viewScan.getTable().getQualifiedName().get(0),
+ null, false, false);
+ }
+ return new RelOptMaterialization(newViewScan, materialization.queryRel, null);
+ }
+ }
+ );
+ } catch (HiveException e) {
+ LOG.warn("Exception loading materialized views", e);
+ }
+ if (!materializations.isEmpty()) {
+ for (RelOptMaterialization materialization : materializations) {
+ planner.addMaterialization(materialization);
+ }
+ // Add view-based rewriting rules to planner
+ planner.addRule(HiveMaterializedViewFilterScanRule.INSTANCE);
+ // Optimize plan
+ planner.setRoot(calciteOptimizedPlan);
+ calciteOptimizedPlan = planner.findBestExp();
+ // Remove view-based rewriting rules from planner
+ planner.clear();
+ }
+ // Restore default cost model
+ RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.of(mdProvider.getMetadataProvider()));
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: View-based rewriting");
+ }
+
+ // 6. Run aggregate-join transpose (cost based)
+ // If it failed because of missing stats, we continue with
+ // the rest of optimizations
+ if (conf.getBoolVar(ConfVars.AGGR_JOIN_TRANSPOSE)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ try {
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, HiveAggregateJoinTransposeRule.INSTANCE);
+ } catch (Exception e) {
+ boolean isMissingStats = noColsMissingStats.get() > 0;
+ if (isMissingStats) {
+ LOG.warn("Missing column stats (see previous messages), skipping aggregate-join transpose in CBO");
+ noColsMissingStats.set(0);
+ } else {
+ throw e;
+ }
+ }
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Aggregate join transpose");
+ }
+
+ // 7. Convert Join + GBy to semijoin
+ // Run this rule at a later stage, since many Calcite rules can't deal with semijoins
+ if (conf.getBoolVar(ConfVars.SEMIJOIN_CONVERSION)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, HiveSemiJoinRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Semijoin conversion");
+ }
+
+
+ // 8. Run rule to fix windowing issue when it is done over
+ // aggregation columns (HIVE-10627)
+ if (profilesCBO.contains(ExtendedCBOProfile.WINDOWING_POSTPROCESSING)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, HiveWindowingFixRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Window fixing rule");
+ }
+
+ // 9. Apply Druid transformation rules
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, DruidRules.FILTER, DruidRules.AGGREGATE_PROJECT,
+ DruidRules.PROJECT, DruidRules.AGGREGATE, DruidRules.SORT_PROJECT_TRANSPOSE,
+ DruidRules.SORT, DruidRules.PROJECT_SORT_TRANSPOSE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Druid transformation rules");
+
+ // 10. Run rules to aid in translation from Calcite tree to Hive tree
+ if (HiveConf.getBoolVar(conf, ConfVars.HIVE_CBO_RETPATH_HIVEOP)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ // 10.1. Merge join into multijoin operators (if possible)
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, true, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, HiveJoinProjectTransposeRule.BOTH_PROJECT_INCLUDE_OUTER,
+ HiveJoinProjectTransposeRule.LEFT_PROJECT_INCLUDE_OUTER,
+ HiveJoinProjectTransposeRule.RIGHT_PROJECT_INCLUDE_OUTER,
+ HiveJoinToMultiJoinRule.INSTANCE, HiveProjectMergeRule.INSTANCE);
+ // The previous rules can pull up projections through join operators,
+ // thus we run the field trimmer again to push them back down
+ fieldTrimmer = new HiveRelFieldTrimmer(null,
+ HiveRelFactories.HIVE_BUILDER.create(optCluster, null));
+ calciteOptimizedPlan = fieldTrimmer.trim(calciteOptimizedPlan);
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, ProjectRemoveRule.INSTANCE,
+ new ProjectMergeRule(false, HiveRelFactories.HIVE_BUILDER));
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, true, mdProvider.getMetadataProvider(), null,
+ HiveFilterProjectTSTransposeRule.INSTANCE, HiveFilterProjectTSTransposeRule.INSTANCE_DRUID,
+ HiveProjectFilterPullUpConstantsRule.INSTANCE);
+
+ // 10.2. Introduce exchange operators below join/multijoin operators
+ calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+ HepMatchOrder.BOTTOM_UP, HiveInsertExchange4JoinRule.EXCHANGE_BELOW_JOIN,
+ HiveInsertExchange4JoinRule.EXCHANGE_BELOW_MULTIJOIN);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Translation from Calcite tree to Hive tree");
+ }
+
+ if (LOG.isDebugEnabled() && !conf.getBoolVar(ConfVars.HIVE_IN_TEST)) {
+ LOG.debug("CBO Planning details:\n");
+ LOG.debug("Original Plan:\n" + RelOptUtil.toString(calciteGenPlan));
+ LOG.debug("Plan After PPD, PartPruning, ColumnPruning:\n"
+ + RelOptUtil.toString(calcitePreCboPlan));
+ LOG.debug("Plan After Join Reordering:\n"
+ + RelOptUtil.toString(calciteOptimizedPlan, SqlExplainLevel.ALL_ATTRIBUTES));
+ }
+
+ return calciteOptimizedPlan;
+ }
+
+ /**
+ * Perform all optimizations before Join Ordering.
+ *
+ * @param basePlan
+ * original plan
+ * @param mdProvider
+ * meta data provider
+ * @param executorProvider
+ * executor
+ * @return
+ */
+ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProvider mdProvider, RexExecutor executorProvider) {
+ // TODO: Decorrelation of subqueries should be done before attempting
+ // Partition Pruning; otherwise expression evaluation may try to execute a
+ // correlated subquery.
+
+ PerfLogger perfLogger = SessionState.getPerfLogger();
+
+ final int maxCNFNodeCount = conf.getIntVar(HiveConf.ConfVars.HIVE_CBO_CNF_NODES_LIMIT);
+ final int minNumORClauses = conf.getIntVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZERMIN);
+
+ //0. SetOp rewrite
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, true, mdProvider, null, HepMatchOrder.BOTTOM_UP,
+ HiveProjectOverIntersectRemoveRule.INSTANCE, HiveIntersectMergeRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: HiveProjectOverIntersectRemoveRule and HiveIntersectMerge rules");
+
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+ HiveIntersectRewriteRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: HiveIntersectRewrite rule");
+
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+ HiveExceptRewriteRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: HiveExceptRewrite rule");
+
+ //1. Distinct aggregate rewrite
+ // Run this optimization early, since it is expanding the operator pipeline.
+ if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") &&
+ conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEDISTINCTREWRITE)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ // It is not clear whether this rewrite is always performant on MR, since the extra map phase
+ // introduced for the 2nd MR job may offset the gains of this multi-stage aggregation.
+ // We need a cost model for MR to enable this on MR.
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HiveExpandDistinctAggregatesRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Distinct aggregate rewrite");
+ }
+
+ // 2. Try factoring out common filter elements & separating deterministic
+ // vs non-deterministic UDF. This needs to run before PPD so that PPD can
+ // add on-clauses for old style Join Syntax
+ // Ex: select * from R1 join R2 where ((R1.x=R2.x) and R1.y<10) or
+ // ((R1.x=R2.x) and R1.z=10)) and rand(1) < 0.1
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.ARBITRARY,
+ new HivePreFilteringRule(maxCNFNodeCount));
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, factor out common filter elements and separating deterministic vs non-deterministic UDF");
+
+ // 3. Run exhaustive PPD, add not null filters, transitive inference,
+ // constant propagation, constant folding
+ List<RelOptRule> rules = Lists.newArrayList();
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTPPD_WINDOWING)) {
+ rules.add(HiveFilterProjectTransposeRule.INSTANCE_DETERMINISTIC_WINDOWING);
+ } else {
+ rules.add(HiveFilterProjectTransposeRule.INSTANCE_DETERMINISTIC);
+ }
+ rules.add(HiveFilterSetOpTransposeRule.INSTANCE);
+ rules.add(HiveFilterSortTransposeRule.INSTANCE);
+ rules.add(HiveFilterJoinRule.JOIN);
+ rules.add(HiveFilterJoinRule.FILTER_ON_JOIN);
+ rules.add(new HiveFilterAggregateTransposeRule(Filter.class, HiveRelFactories.HIVE_FILTER_FACTORY, Aggregate.class));
+ rules.add(new FilterMergeRule(HiveRelFactories.HIVE_BUILDER));
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_REDUCE_WITH_STATS)) {
+ rules.add(HiveReduceExpressionsWithStatsRule.INSTANCE);
+ }
+ rules.add(HiveProjectFilterPullUpConstantsRule.INSTANCE);
+ rules.add(HiveReduceExpressionsRule.PROJECT_INSTANCE);
+ rules.add(HiveReduceExpressionsRule.FILTER_INSTANCE);
+ rules.add(HiveReduceExpressionsRule.JOIN_INSTANCE);
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER)) {
+ rules.add(new HivePointLookupOptimizerRule.FilterCondition(minNumORClauses));
+ rules.add(new HivePointLookupOptimizerRule.JoinCondition(minNumORClauses));
+ }
+ rules.add(HiveJoinAddNotNullRule.INSTANCE_JOIN);
+ rules.add(HiveJoinAddNotNullRule.INSTANCE_SEMIJOIN);
+ rules.add(HiveJoinPushTransitivePredicatesRule.INSTANCE_JOIN);
+ rules.add(HiveJoinPushTransitivePredicatesRule.INSTANCE_SEMIJOIN);
+ rules.add(HiveSortMergeRule.INSTANCE);
+ rules.add(HiveSortLimitPullUpConstantsRule.INSTANCE);
+ rules.add(HiveUnionPullUpConstantsRule.INSTANCE);
+ rules.add(HiveAggregatePullUpConstantsRule.INSTANCE);
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+ rules.toArray(new RelOptRule[rules.size()]));
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, PPD, not null predicates, transitive inference, constant folding");
+
+ // 4. Push down limit through outer join
+ // NOTE: We run this after PPD to support old style join syntax.
+ // Ex: select * from R1 left outer join R2 where ((R1.x=R2.x) and R1.y<10) or
+ // ((R1.x=R2.x) and R1.z=10)) and rand(1) < 0.1 order by R1.x limit 10
+ if (conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE)) {
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ // This should be a cost-based decision, but until we enable the extended cost
+ // model, we will use the configured value for the variable
+ final float reductionProportion = HiveConf.getFloatVar(conf,
+ HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE_REDUCTION_PERCENTAGE);
+ final long reductionTuples = HiveConf.getLongVar(conf,
+ HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE_REDUCTION_TUPLES);
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HiveSortMergeRule.INSTANCE,
+ HiveSortProjectTransposeRule.INSTANCE, HiveSortJoinReduceRule.INSTANCE,
+ HiveSortUnionReduceRule.INSTANCE);
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+ new HiveSortRemoveRule(reductionProportion, reductionTuples),
+ HiveProjectSortTransposeRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Push down limit through outer join");
+ }
+
+ // 5. Push Down Semi Joins
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, SemiJoinJoinTransposeRule.INSTANCE,
+ SemiJoinFilterTransposeRule.INSTANCE, SemiJoinProjectTransposeRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Push Down Semi Joins");
+
+ // 6. Apply Partition Pruning
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, new HivePartitionPruneRule(conf));
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Partition Pruning");
+
+ // 7. Projection Pruning (this introduces select above TS & hence needs to be run last due to PP)
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ HiveRelFieldTrimmer fieldTrimmer = new HiveRelFieldTrimmer(null,
+ HiveRelFactories.HIVE_BUILDER.create(cluster, null),
+ profilesCBO.contains(ExtendedCBOProfile.JOIN_REORDERING));
+ basePlan = fieldTrimmer.trim(basePlan);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Projection Pruning");
+
+ // 8. Merge, remove and reduce Project if possible
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, false, mdProvider, executorProvider,
+ HiveProjectMergeRule.INSTANCE, ProjectRemoveRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Merge Project-Project");
+
+ // 9. Rerun PPD through Project as column pruning would have introduced
+ // DT above scans; by pushing the filter just above the TS, Hive can push it into
+ // storage (in case there are filters on non-partition cols). This only
+ // matches FIL-PROJ-TS
+ perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+ basePlan = hepPlan(basePlan, true, mdProvider, executorProvider,
+ HiveFilterProjectTSTransposeRule.INSTANCE, HiveFilterProjectTSTransposeRule.INSTANCE_DRUID,
+ HiveProjectFilterPullUpConstantsRule.INSTANCE);
+ perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+ "Calcite: Prejoin ordering transformation, Rerun PPD");
+
+ return basePlan;
+ }
+
+ /**
+ * Run the HEP Planner with the given rule set.
+ *
+ * @param basePlan
+ * @param followPlanChanges
+ * @param mdProvider
+ * @param executorProvider
+ * @param rules
+ * @return optimized RelNode
+ */
+ private RelNode hepPlan(RelNode basePlan, boolean followPlanChanges,
+ RelMetadataProvider mdProvider, RexExecutor executorProvider, RelOptRule... rules) {
+ return hepPlan(basePlan, followPlanChanges, mdProvider, executorProvider,
+ HepMatchOrder.TOP_DOWN, rules);
+ }
+
+ /**
+ * Run the HEP Planner with the given rule set.
+ *
+ * @param basePlan
+ * @param followPlanChanges
+ * @param mdProvider
+ * @param executorProvider
+ * @param order
+ * @param rules
+ * @return optimized RelNode
+ */
+ private RelNode hepPlan(RelNode basePlan, boolean followPlanChanges,
+ RelMetadataProvider mdProvider, RexExecutor executorProvider, HepMatchOrder order,
+ RelOptRule... rules) {
+
+ RelNode optimizedRelNode = basePlan;
+ HepProgramBuilder programBuilder = new HepProgramBuilder();
+ if (followPlanChanges) {
+ programBuilder.addMatchOrder(order);
+ programBuilder = programBuilder.addRuleCollection(ImmutableList.copyOf(rules));
+ } else {
+ // TODO: Should this be also TOP_DOWN?
+ for (RelOptRule r : rules)
+ programBuilder.addRuleInstance(r);
+ }
+
+ // Create planner and copy context
+ HepPlanner planner = new HepPlanner(programBuilder.build(),
+ basePlan.getCluster().getPlanner().getContext());
+
+ List<RelMetadataProvider> list = Lists.newArrayList();
+ list.add(mdProvider);
+ planner.registerMetadataProviders(list);
+ RelMetadataProvider chainedProvider = ChainedRelMetadataProvider.of(list);
+ basePlan.getCluster().setMetadataProvider(
+ new CachingRelMetadataProvider(chainedProvider, planner));
+
+ if (executorProvider != null) {
+ // basePlan.getCluster().getPlanner() is the VolcanoPlanner from apply();
+ // both planners need to use the correct executor
+ basePlan.getCluster().getPlanner().setExecutor(executorProvider);
+ planner.setExecutor(executorProvider);
+ }
+
+ planner.setRoot(basePlan);
+ optimizedRelNode = planner.findBestExp();
+
+ return optimizedRelNode;
+ }
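As a usage note, the two overloads above are already exercised by the surrounding code; a condensed sketch of the distinction (calls adapted from applyPreJoinOrderingTransforms above):

    // followPlanChanges == true: rules are added as one collection (addRuleCollection)
    basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HiveSortMergeRule.INSTANCE,
        HiveSortProjectTransposeRule.INSTANCE, HiveSortJoinReduceRule.INSTANCE);
    // followPlanChanges == false: each rule is registered individually (addRuleInstance)
    basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, new HivePartitionPruneRule(conf));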
+
+ @SuppressWarnings("nls")
+ private RelNode genSetOpLogicalPlan(Opcode opcode, String alias, String leftalias, RelNode leftRel,
+ String rightalias, RelNode rightRel) throws SemanticException {
+ // 1. Get Row Resolvers, Column map for original left and right input of
+ // SetOp Rel
+ RowResolver leftRR = this.relToHiveRR.get(leftRel);
+ RowResolver rightRR = this.relToHiveRR.get(rightRel);
+ HashMap<String, ColumnInfo> leftmap = leftRR.getFieldMap(leftalias);
+ HashMap<String, ColumnInfo> rightmap = rightRR.getFieldMap(rightalias);
+
+ // 2. Validate that SetOp is feasible according to Hive (by using type
+ // info from RR)
+ if (leftmap.size() != rightmap.size()) {
+ throw new SemanticException("Schema of both sides of union should match.");
+ }
+
+ ASTNode tabref = getQB().getAliases().isEmpty() ? null : getQB().getParseInfo()
+ .getSrcForAlias(getQB().getAliases().get(0));
+
+ // 3. construct SetOp Output RR using original left & right Input
+ RowResolver setOpOutRR = new RowResolver();
+
+ Iterator<Map.Entry<String, ColumnInfo>> lIter = leftmap.entrySet().iterator();
+ Iterator<Map.Entry<String, ColumnInfo>> rIter = rightmap.entrySet().iterator();
+ while (lIter.hasNext()) {
+ Map.Entry<String, ColumnInfo> lEntry = lIter.next();
+ Map.Entry<String, ColumnInfo> rEntry = rIter.next();
+ ColumnInfo lInfo = lEntry.getValue();
+ ColumnInfo rInfo = rEntry.getValue();
+
+ String field = lEntry.getKey();
+ // try widening conversion, otherwise fail union
+ TypeInfo commonTypeInfo = FunctionRegistry.getCommonClassForUnionAll(lInfo.getType(),
+ rInfo.getType());
+ if (commonTypeInfo == null) {
+ throw new SemanticException(generateErrorMessage(tabref,
+ "Schema of both sides of setop should match: Column " + field
+ + " is of type " + lInfo.getType().getTypeName()
+ + " on first table and type " + rInfo.getType().getTypeName()
+ + " on second table"));
+ }
+ ColumnInfo setOpColInfo = new ColumnInfo(lInfo);
+ setOpColInfo.setType(commonTypeInfo);
+ setOpOutRR.put(alias, field, setOpColInfo);
+ }
+
+ // 4. Determine which columns require a cast on the left/right input (Calcite
+ // requires exact types on both sides of a SetOp)
+ boolean leftNeedsTypeCast = false;
+ boolean rightNeedsTypeCast = false;
+ List<RexNode> leftProjs = new ArrayList<RexNode>();
+ List<RexNode> rightProjs = new ArrayList<RexNode>();
+ List<RelDataTypeField> leftRowDT = leftRel.getRowType().getFieldList();
+ List<RelDataTypeField> rightRowDT = rightRel.getRowType().getFieldList();
+
+ RelDataType leftFieldDT;
+ RelDataType rightFieldDT;
+ RelDataType unionFieldDT;
+ for (int i = 0; i < leftRowDT.size(); i++) {
+ leftFieldDT = leftRowDT.get(i).getType();
+ rightFieldDT = rightRowDT.get(i).getType();
+ if (!leftFieldDT.equals(rightFieldDT)) {
+ unionFieldDT = TypeConverter.convert(setOpOutRR.getColumnInfos().get(i).getType(),
+ cluster.getTypeFactory());
+ if (!unionFieldDT.equals(leftFieldDT)) {
+ leftNeedsTypeCast = true;
+ }
+ leftProjs.add(cluster.getRexBuilder().ensureType(unionFieldDT,
+ cluster.getRexBuilder().makeInputRef(leftFieldDT, i), true));
+
+ if (!unionFieldDT.equals(rightFieldDT)) {
+ rightNeedsTypeCast = true;
+ }
+ rightProjs.add(cluster.getRexBuilder().ensureType(unionFieldDT,
+ cluster.getRexBuilder().makeInputRef(rightFieldDT, i), true));
+ } else {
+ leftProjs.add(cluster.getRexBuilder().ensureType(leftFieldDT,
+ cluster.getRexBuilder().makeInputRef(leftFieldDT, i), true));
+ rightProjs.add(cluster.getRexBuilder().ensureType(rightFieldDT,
+ cluster.getRexBuilder().makeInputRef(rightFieldDT, i), true));
+ }
+ }
+
+ // 5. Introduce Project Rel above original left/right inputs if cast is
+ // needed for type parity
+ RelNode setOpLeftInput = leftRel;
+ RelNode setOpRightInput = rightRel;
+ if (leftNeedsTypeCast) {
+ setOpLeftInput = HiveProject.create(leftRel, leftProjs, leftRel.getRowType()
+ .getFieldNames());
+ }
+ if (rightNeedsTypeCast) {
+ setOpRightInput = HiveProject.create(rightRel, rightProjs, rightRel.getRowType()
+ .getFieldNames());
+ }
+
+ // 6. Construct SetOp Rel
+ Builder<RelNode> bldr = new ImmutableList.Builder<RelNode>();
+ bldr.add(setOpLeftInput);
+ bldr.add(setOpRightInput);
+ SetOp setOpRel = null;
+ switch (opcode) {
+ case UNION:
+ setOpRel = new HiveUnion(cluster, TraitsUtil.getDefaultTraitSet(cluster), bldr.build());
+ break;
+ case INTERSECT:
+ setOpRel = new HiveIntersect(cluster, TraitsUtil.getDefaultTraitSet(cluster), bldr.build(),
+ false);
+
<TRUNCATED>
[2/5] hive git commit: HIVE-16423: Add hint to enforce semi join
optimization (Deepak Jaiswal, reviewed by Jason Dere)
Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
new file mode 100644
index 0000000..b5a5645
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
@@ -0,0 +1,13508 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.security.AccessControlException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Queue;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.UUID;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.antlr.runtime.ClassicToken;
+import org.antlr.runtime.CommonToken;
+import org.antlr.runtime.Token;
+import org.antlr.runtime.tree.Tree;
+import org.antlr.runtime.tree.TreeVisitor;
+import org.antlr.runtime.tree.TreeVisitorAction;
+import org.antlr.runtime.tree.TreeWizard;
+import org.antlr.runtime.tree.TreeWizard.ContextVisitor;
+import org.apache.calcite.rel.RelNode;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.common.StatsSetupConst.StatDB;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.hadoop.hive.metastore.api.Database;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Order;
+import org.apache.hadoop.hive.metastore.api.SQLForeignKey;
+import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey;
+import org.apache.hadoop.hive.ql.CompilationOpContext;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.QueryProperties;
+import org.apache.hadoop.hive.ql.QueryState;
+import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.ArchiveUtils;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
+import org.apache.hadoop.hive.ql.exec.FetchTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.FunctionInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.RecordReader;
+import org.apache.hadoop.hive.ql.exec.RecordWriter;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.UnionOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.hooks.ReadEntity;
+import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.io.AcidUtils.Operation;
+import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
+import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
+import org.apache.hadoop.hive.ql.io.NullRowsInputFormat;
+import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.metadata.DummyPartition;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.optimizer.Optimizer;
+import org.apache.hadoop.hive.ql.optimizer.Transform;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverterPostProc;
+import org.apache.hadoop.hive.ql.optimizer.lineage.Generator;
+import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec.SpecType;
+import org.apache.hadoop.hive.ql.parse.CalcitePlanner.ASTSearcher;
+import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFInputSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputType;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionedTableFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitioningSpec;
+import org.apache.hadoop.hive.ql.parse.QBSubQuery.SubQueryType;
+import org.apache.hadoop.hive.ql.parse.SubQueryUtils.ISubQueryJoinInfo;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.Direction;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFrameSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.AlterTableDesc;
+import org.apache.hadoop.hive.ql.plan.AlterTableDesc.AlterTableTypes;
+import org.apache.hadoop.hive.ql.plan.CreateTableDesc;
+import org.apache.hadoop.hive.ql.plan.CreateTableLikeDesc;
+import org.apache.hadoop.hive.ql.plan.CreateViewDesc;
+import org.apache.hadoop.hive.ql.plan.DDLWork;
+import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
+import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc;
+import org.apache.hadoop.hive.ql.plan.ForwardDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.HiveOperation;
+import org.apache.hadoop.hive.ql.plan.InsertTableDesc;
+import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
+import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc;
+import org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc;
+import org.apache.hadoop.hive.ql.plan.LimitDesc;
+import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
+import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
+import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PTFDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.ScriptDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.UDTFDesc;
+import org.apache.hadoop.hive.ql.plan.UnionDesc;
+import org.apache.hadoop.hive.ql.plan.ptf.OrderExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
+import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.session.SessionState.ResourceType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.ql.util.ResourceDownloader;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.Deserializer;
+import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
+import org.apache.hadoop.hive.serde2.NoOpFetchFormatter;
+import org.apache.hadoop.hive.serde2.NullStructSerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
+import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.hive.shims.HadoopShims;
+import org.apache.hadoop.hive.shims.Utils;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import com.google.common.base.Splitter;
+import com.google.common.base.Strings;
+import com.google.common.collect.Sets;
+import com.google.common.math.IntMath;
+
+/**
+ * Implementation of the semantic analyzer. It generates the query plan.
+ * There are other, more specific semantic analyzers for some Hive operations, such as
+ * DDLSemanticAnalyzer for DDL operations.
+ */
+
+public class SemanticAnalyzer extends BaseSemanticAnalyzer {
+
+ public static final String DUMMY_DATABASE = "_dummy_database";
+ public static final String DUMMY_TABLE = "_dummy_table";
+ public static final String SUBQUERY_TAG_1 = "-subquery1";
+ public static final String SUBQUERY_TAG_2 = "-subquery2";
+
+ // Max characters when auto generating the column name with func name
+ private static final int AUTOGEN_COLALIAS_PRFX_MAXLENGTH = 20;
+
+ public static final String VALUES_TMP_TABLE_NAME_PREFIX = "Values__Tmp__Table__";
+
+ static final String MATERIALIZATION_MARKER = "$MATERIALIZATION";
+
+ private HashMap<TableScanOperator, ExprNodeDesc> opToPartPruner;
+ private HashMap<TableScanOperator, PrunedPartitionList> opToPartList;
+ protected HashMap<String, TableScanOperator> topOps;
+ protected LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtx;
+ private List<LoadTableDesc> loadTableWork;
+ private List<LoadFileDesc> loadFileWork;
+ private List<ColumnStatsAutoGatherContext> columnStatsAutoGatherContexts;
+ private final Map<JoinOperator, QBJoinTree> joinContext;
+ private final Map<SMBMapJoinOperator, QBJoinTree> smbMapJoinContext;
+ private final HashMap<TableScanOperator, Table> topToTable;
+ private final Map<FileSinkOperator, Table> fsopToTable;
+ private final List<ReduceSinkOperator> reduceSinkOperatorsAddedByEnforceBucketingSorting;
+ private final HashMap<TableScanOperator, Map<String, String>> topToTableProps;
+ private QB qb;
+ private ASTNode ast;
+ private int destTableId;
+ private UnionProcContext uCtx;
+ List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer;
+ private HashMap<TableScanOperator, SampleDesc> opToSamplePruner;
+ private final Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPartToSkewedPruner;
+ private Map<SelectOperator, Table> viewProjectToTableSchema;
+ /**
+ * a map for the split sampling, from alias to an instance of SplitSample
+ * that describes percentage and number.
+ */
+ private final HashMap<String, SplitSample> nameToSplitSample;
+ Map<GroupByOperator, Set<String>> groupOpToInputTables;
+ Map<String, PrunedPartitionList> prunedPartitions;
+ protected List<FieldSchema> resultSchema;
+ protected CreateViewDesc createVwDesc;
+ protected ArrayList<String> viewsExpanded;
+ protected ASTNode viewSelect;
+ protected final UnparseTranslator unparseTranslator;
+ private final GlobalLimitCtx globalLimitCtx;
+
+ // prefix for column names auto generated by hive
+ private final String autogenColAliasPrfxLbl;
+ private final boolean autogenColAliasPrfxIncludeFuncName;
+
+ // Keep track of view alias to read entity corresponding to the view
+ // For eg: for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T
+ // keeps track of aliases for V3, V3:V2, V3:V2:V1.
+ // This is used when T is added as an input for the query; the parents of T are
+ // derived from the alias V3:V2:V1:T
+ private final Map<String, ReadEntity> viewAliasToInput;
+
+ //need to merge the isDirect flag into the input even if the newInput does not have a parent
+ private boolean mergeIsDirect;
+
+ // flag for no scan during analyze ... compute statistics
+ protected boolean noscan;
+
+ //flag for partial scan during analyze ... compute statistics
+ protected boolean partialscan;
+
+ protected volatile boolean disableJoinMerge = false;
+ protected final boolean defaultJoinMerge;
+
+ /*
+ * Capture the CTE definitions in a Query.
+ */
+ final Map<String, CTEClause> aliasToCTEs;
+
+ /*
+ * Used to check recursive CTE invocations. Similar to viewsExpanded
+ */
+ ArrayList<String> ctesExpanded;
+
+ /*
+ * Whether root tasks after materialized CTE linkage have been resolved
+ */
+ boolean rootTasksResolved;
+
+ protected TableMask tableMask;
+
+ CreateTableDesc tableDesc;
+
+ /** Not thread-safe. */
+ final ASTSearcher astSearcher = new ASTSearcher();
+
+ protected AnalyzeRewriteContext analyzeRewrite;
+
+ // A mapping from a tableName to a table object in metastore.
+ Map<String, Table> tabNameToTabObject;
+
+ // The tokens we should ignore when we are trying to do table masking.
+ private final Set<Integer> ignoredTokens = Sets.newHashSet(HiveParser.TOK_GROUPBY,
+ HiveParser.TOK_ORDERBY, HiveParser.TOK_WINDOWSPEC, HiveParser.TOK_CLUSTERBY,
+ HiveParser.TOK_DISTRIBUTEBY, HiveParser.TOK_SORTBY);
+
+ static class Phase1Ctx {
+ String dest;
+ int nextNum;
+ }
+
+ public SemanticAnalyzer(QueryState queryState) throws SemanticException {
+ super(queryState);
+ opToPartPruner = new HashMap<TableScanOperator, ExprNodeDesc>();
+ opToPartList = new HashMap<TableScanOperator, PrunedPartitionList>();
+ opToSamplePruner = new HashMap<TableScanOperator, SampleDesc>();
+ nameToSplitSample = new HashMap<String, SplitSample>();
+ // Must be deterministic order maps - see HIVE-8707
+ topOps = new LinkedHashMap<String, TableScanOperator>();
+ loadTableWork = new ArrayList<LoadTableDesc>();
+ loadFileWork = new ArrayList<LoadFileDesc>();
+ columnStatsAutoGatherContexts = new ArrayList<ColumnStatsAutoGatherContext>();
+ opParseCtx = new LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext>();
+ joinContext = new HashMap<JoinOperator, QBJoinTree>();
+ smbMapJoinContext = new HashMap<SMBMapJoinOperator, QBJoinTree>();
+ // Must be deterministic order map for consistent q-test output across Java versions
+ topToTable = new LinkedHashMap<TableScanOperator, Table>();
+ fsopToTable = new HashMap<FileSinkOperator, Table>();
+ reduceSinkOperatorsAddedByEnforceBucketingSorting = new ArrayList<ReduceSinkOperator>();
+ topToTableProps = new HashMap<TableScanOperator, Map<String, String>>();
+ destTableId = 1;
+ uCtx = null;
+ listMapJoinOpsNoReducer = new ArrayList<AbstractMapJoinOperator<? extends MapJoinDesc>>();
+ groupOpToInputTables = new HashMap<GroupByOperator, Set<String>>();
+ prunedPartitions = new HashMap<String, PrunedPartitionList>();
+ tabNameToTabObject = new HashMap<String, Table>();
+ unparseTranslator = new UnparseTranslator(conf);
+ autogenColAliasPrfxLbl = HiveConf.getVar(conf,
+ HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_LABEL);
+ autogenColAliasPrfxIncludeFuncName = HiveConf.getBoolVar(conf,
+ HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_INCLUDEFUNCNAME);
+ queryProperties = new QueryProperties();
+ opToPartToSkewedPruner = new HashMap<TableScanOperator, Map<String, ExprNodeDesc>>();
+ aliasToCTEs = new HashMap<String, CTEClause>();
+ globalLimitCtx = new GlobalLimitCtx();
+ viewAliasToInput = new HashMap<String, ReadEntity>();
+ mergeIsDirect = true;
+ noscan = partialscan = false;
+ tabNameToTabObject = new HashMap<>();
+ defaultJoinMerge = !HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_MERGE_NWAY_JOINS);
+ disableJoinMerge = defaultJoinMerge;
+ }
+
+ @Override
+ protected void reset(boolean clearPartsCache) {
+ super.reset(true);
+ if(clearPartsCache) {
+ prunedPartitions.clear();
+
+ //When init(true) is combined with genResolvedParseTree, it generates the resolved parse tree from the syntax tree.
+ //ReadEntity objects created under these conditions should all be relevant to the syntax tree, even the ones without parents,
+ //so set mergeIsDirect to true here.
+ mergeIsDirect = true;
+ } else {
+ mergeIsDirect = false;
+ }
+ tabNameToTabObject.clear();
+ loadTableWork.clear();
+ loadFileWork.clear();
+ columnStatsAutoGatherContexts.clear();
+ topOps.clear();
+ destTableId = 1;
+ idToTableNameMap.clear();
+ qb = null;
+ ast = null;
+ uCtx = null;
+ joinContext.clear();
+ smbMapJoinContext.clear();
+ opParseCtx.clear();
+ groupOpToInputTables.clear();
+ disableJoinMerge = defaultJoinMerge;
+ aliasToCTEs.clear();
+ topToTable.clear();
+ opToPartPruner.clear();
+ opToPartList.clear();
+ opToPartToSkewedPruner.clear();
+ opToSamplePruner.clear();
+ nameToSplitSample.clear();
+ fsopToTable.clear();
+ resultSchema = null;
+ createVwDesc = null;
+ viewsExpanded = null;
+ viewSelect = null;
+ ctesExpanded = null;
+ globalLimitCtx.disableOpt();
+ viewAliasToInput.clear();
+ reduceSinkOperatorsAddedByEnforceBucketingSorting.clear();
+ topToTableProps.clear();
+ listMapJoinOpsNoReducer.clear();
+ unparseTranslator.clear();
+ queryProperties.clear();
+ outputs.clear();
+ }
+
+ public void initParseCtx(ParseContext pctx) {
+ opToPartPruner = pctx.getOpToPartPruner();
+ opToPartList = pctx.getOpToPartList();
+ opToSamplePruner = pctx.getOpToSamplePruner();
+ topOps = pctx.getTopOps();
+ loadTableWork = pctx.getLoadTableWork();
+ loadFileWork = pctx.getLoadFileWork();
+ ctx = pctx.getContext();
+ destTableId = pctx.getDestTableId();
+ idToTableNameMap = pctx.getIdToTableNameMap();
+ uCtx = pctx.getUCtx();
+ listMapJoinOpsNoReducer = pctx.getListMapJoinOpsNoReducer();
+ prunedPartitions = pctx.getPrunedPartitions();
+ tabNameToTabObject = pctx.getTabNameToTabObject();
+ fetchTask = pctx.getFetchTask();
+ setLineageInfo(pctx.getLineageInfo());
+ }
+
+ public ParseContext getParseContext() {
+ // Make sure the basic query properties are initialized
+ copyInfoToQueryProperties(queryProperties);
+ return new ParseContext(queryState, opToPartPruner, opToPartList, topOps,
+ new HashSet<JoinOperator>(joinContext.keySet()),
+ new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()),
+ loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx,
+ listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject,
+ opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks,
+ opToPartToSkewedPruner, viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting,
+ analyzeRewrite, tableDesc, createVwDesc, queryProperties, viewProjectToTableSchema, acidFileSinks);
+ }
+
+ public CompilationOpContext getOpContext() {
+ return ctx.getOpContext();
+ }
+
+ public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias)
+ throws SemanticException {
+ doPhase1QBExpr(ast, qbexpr, id, alias, false);
+ }
+ @SuppressWarnings("nls")
+ public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias, boolean insideView)
+ throws SemanticException {
+
+ assert (ast.getToken() != null);
+ if (ast.getToken().getType() == HiveParser.TOK_QUERY) {
+ QB qb = new QB(id, alias, true);
+ qb.setInsideView(insideView);
+ Phase1Ctx ctx_1 = initPhase1Ctx();
+ doPhase1(ast, qb, ctx_1, null);
+
+ qbexpr.setOpcode(QBExpr.Opcode.NULLOP);
+ qbexpr.setQB(qb);
+ }
+ // setop
+ else {
+ switch (ast.getToken().getType()) {
+ case HiveParser.TOK_UNIONALL:
+ qbexpr.setOpcode(QBExpr.Opcode.UNION);
+ break;
+ case HiveParser.TOK_INTERSECTALL:
+ qbexpr.setOpcode(QBExpr.Opcode.INTERSECTALL);
+ break;
+ case HiveParser.TOK_INTERSECTDISTINCT:
+ qbexpr.setOpcode(QBExpr.Opcode.INTERSECT);
+ break;
+ case HiveParser.TOK_EXCEPTALL:
+ qbexpr.setOpcode(QBExpr.Opcode.EXCEPTALL);
+ break;
+ case HiveParser.TOK_EXCEPTDISTINCT:
+ qbexpr.setOpcode(QBExpr.Opcode.EXCEPT);
+ break;
+ default:
+ throw new SemanticException(ErrorMsg.UNSUPPORTED_SET_OPERATOR.getMsg("Type "
+ + ast.getToken().getType()));
+ }
+ // query 1
+ assert (ast.getChild(0) != null);
+ QBExpr qbexpr1 = new QBExpr(alias + SUBQUERY_TAG_1);
+ doPhase1QBExpr((ASTNode) ast.getChild(0), qbexpr1, id + SUBQUERY_TAG_1, alias
+ + SUBQUERY_TAG_1, insideView);
+ qbexpr.setQBExpr1(qbexpr1);
+
+ // query 2
+ assert (ast.getChild(1) != null);
+ QBExpr qbexpr2 = new QBExpr(alias + SUBQUERY_TAG_2);
+ doPhase1QBExpr((ASTNode) ast.getChild(1), qbexpr2, id + SUBQUERY_TAG_2, alias
+ + SUBQUERY_TAG_2, insideView);
+ qbexpr.setQBExpr2(qbexpr2);
+ }
+ }
+
+ private LinkedHashMap<String, ASTNode> doPhase1GetAggregationsFromSelect(
+ ASTNode selExpr, QB qb, String dest) throws SemanticException {
+
+ // Iterate over the selects search for aggregation Trees.
+ // Use String as keys to eliminate duplicate trees.
+ LinkedHashMap<String, ASTNode> aggregationTrees = new LinkedHashMap<String, ASTNode>();
+ List<ASTNode> wdwFns = new ArrayList<ASTNode>();
+ for (int i = 0; i < selExpr.getChildCount(); ++i) {
+ ASTNode function = (ASTNode) selExpr.getChild(i);
+ if (function.getType() == HiveParser.TOK_SELEXPR ||
+ function.getType() == HiveParser.TOK_SUBQUERY_EXPR) {
+ function = (ASTNode)function.getChild(0);
+ }
+ doPhase1GetAllAggregations(function, aggregationTrees, wdwFns);
+ }
+
+ // window based aggregations are handled differently
+ for (ASTNode wdwFn : wdwFns) {
+ WindowingSpec spec = qb.getWindowingSpec(dest);
+ if(spec == null) {
+ queryProperties.setHasWindowing(true);
+ spec = new WindowingSpec();
+ qb.addDestToWindowingSpec(dest, spec);
+ }
+ HashMap<String, ASTNode> wExprsInDest = qb.getParseInfo().getWindowingExprsForClause(dest);
+ int wColIdx = spec.getWindowExpressions() == null ? 0 : spec.getWindowExpressions().size();
+ WindowFunctionSpec wFnSpec = processWindowFunction(wdwFn,
+ (ASTNode)wdwFn.getChild(wdwFn.getChildCount()-1));
+ // If this is a duplicate invocation of a function, don't add it to the WindowingSpec.
+ if ( wExprsInDest != null &&
+ wExprsInDest.containsKey(wFnSpec.getExpression().toStringTree())) {
+ continue;
+ }
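+ // Illustrative: window functions get auto-generated aliases such as "rank_window_0",
+ // "sum_window_1" (function name + "_window_" + running index).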
+ wFnSpec.setAlias(wFnSpec.getName() + "_window_" + wColIdx);
+ spec.addWindowFunction(wFnSpec);
+ qb.getParseInfo().addWindowingExprToClause(dest, wFnSpec.getExpression());
+ }
+
+ return aggregationTrees;
+ }
+
+ private void doPhase1GetColumnAliasesFromSelect(
+ ASTNode selectExpr, QBParseInfo qbp) {
+ for (int i = 0; i < selectExpr.getChildCount(); ++i) {
+ ASTNode selExpr = (ASTNode) selectExpr.getChild(i);
+ if ((selExpr.getToken().getType() == HiveParser.TOK_SELEXPR)
+ && (selExpr.getChildCount() == 2)) {
+ String columnAlias = unescapeIdentifier(selExpr.getChild(1).getText());
+ qbp.setExprToColumnAlias((ASTNode) selExpr.getChild(0), columnAlias);
+ }
+ }
+ }
+
+ /**
+ * DFS-scan the expressionTree to find all aggregation subtrees and put them
+ * in aggregations.
+ *
+ * @param expressionTree
+ * @param aggregations
+ * the key to the HashTable is the toStringTree() representation of
+ * the aggregation subtree.
+ * @throws SemanticException
+ */
+ private void doPhase1GetAllAggregations(ASTNode expressionTree,
+ HashMap<String, ASTNode> aggregations, List<ASTNode> wdwFns) throws SemanticException {
+ int exprTokenType = expressionTree.getToken().getType();
+ if(exprTokenType == HiveParser.TOK_SUBQUERY_EXPR) {
+ //since we now have scalar subqueries, a subquery expression can appear in HAVING;
+ // we don't want to include aggregates from within the subquery
+ return;
+ }
+
+ if (exprTokenType == HiveParser.TOK_FUNCTION
+ || exprTokenType == HiveParser.TOK_FUNCTIONDI
+ || exprTokenType == HiveParser.TOK_FUNCTIONSTAR) {
+ assert (expressionTree.getChildCount() != 0);
+ if (expressionTree.getChild(expressionTree.getChildCount()-1).getType()
+ == HiveParser.TOK_WINDOWSPEC) {
+ // If it is a windowing spec, we include it in the list
+ // Further, we will examine its children AST nodes to check whether
+ // there are aggregation functions within
+ wdwFns.add(expressionTree);
+ doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(expressionTree.getChildCount()-1),
+ aggregations, wdwFns);
+ return;
+ }
+ if (expressionTree.getChild(0).getType() == HiveParser.Identifier) {
+ String functionName = unescapeIdentifier(expressionTree.getChild(0)
+ .getText());
+ // Validate the function name
+ if (FunctionRegistry.getFunctionInfo(functionName) == null) {
+ throw new SemanticException(ErrorMsg.INVALID_FUNCTION.getMsg(functionName));
+ }
+ if(FunctionRegistry.impliesOrder(functionName)) {
+ throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName));
+ }
+ if (FunctionRegistry.getGenericUDAFResolver(functionName) != null) {
+ if(containsLeadLagUDF(expressionTree)) {
+ throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName));
+ }
+ aggregations.put(expressionTree.toStringTree(), expressionTree);
+ FunctionInfo fi = FunctionRegistry.getFunctionInfo(functionName);
+ if (!fi.isNative()) {
+ unparseTranslator.addIdentifierTranslation((ASTNode) expressionTree
+ .getChild(0));
+ }
+ return;
+ }
+ }
+ }
+ for (int i = 0; i < expressionTree.getChildCount(); i++) {
+ doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(i),
+ aggregations, wdwFns);
+ }
+ }
+
+ private List<ASTNode> doPhase1GetDistinctFuncExprs(
+ HashMap<String, ASTNode> aggregationTrees) throws SemanticException {
+ List<ASTNode> exprs = new ArrayList<ASTNode>();
+ for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
+ ASTNode value = entry.getValue();
+ assert (value != null);
+ if (value.getToken().getType() == HiveParser.TOK_FUNCTIONDI) {
+ exprs.add(value);
+ }
+ }
+ return exprs;
+ }
+
+ public static String generateErrorMessage(ASTNode ast, String message) {
+ StringBuilder sb = new StringBuilder();
+ if (ast == null) {
+ sb.append(message).append(". Cannot tell the position of null AST.");
+ return sb.toString();
+ }
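+ // Resulting format (illustrative): "<line>:<col> <message>. Error encountered near token '<text>'".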
+ sb.append(ast.getLine());
+ sb.append(":");
+ sb.append(ast.getCharPositionInLine());
+ sb.append(" ");
+ sb.append(message);
+ sb.append(". Error encountered near token '");
+ sb.append(ErrorMsg.getText(ast));
+ sb.append("'");
+ return sb.toString();
+ }
+
+ ASTNode getAST() {
+ return this.ast;
+ }
+
+ protected void setAST(ASTNode newAST) {
+ this.ast = newAST;
+ }
+
+ int[] findTabRefIdxs(ASTNode tabref) {
+ assert tabref.getType() == HiveParser.TOK_TABREF;
+ int aliasIndex = 0;
+ int propsIndex = -1;
+ int tsampleIndex = -1;
+ int ssampleIndex = -1;
+ for (int index = 1; index < tabref.getChildCount(); index++) {
+ ASTNode ct = (ASTNode) tabref.getChild(index);
+ if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
+ tsampleIndex = index;
+ } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+ ssampleIndex = index;
+ } else if (ct.getToken().getType() == HiveParser.TOK_TABLEPROPERTIES) {
+ propsIndex = index;
+ } else {
+ aliasIndex = index;
+ }
+ }
+ return new int[] {aliasIndex, propsIndex, tsampleIndex, ssampleIndex};
+ }
+ String findSimpleTableName(ASTNode tabref, int aliasIndex) {
+ assert tabref.getType() == HiveParser.TOK_TABREF;
+ ASTNode tableTree = (ASTNode) (tabref.getChild(0));
+
+ String alias;
+ if (aliasIndex != 0) {
+ alias = unescapeIdentifier(tabref.getChild(aliasIndex).getText());
+ }
+ else {
+ alias = getUnescapedUnqualifiedTableName(tableTree);
+ }
+ return alias;
+ }
+ /**
+ * Goes through the tabref tree and finds the alias for the table. Once found,
+ * it records the table name -> alias association in aliasToTabs. It also makes
+ * an association from the alias to the table AST in parse info.
+ *
+ * @return the alias of the table
+ */
+ private String processTable(QB qb, ASTNode tabref) throws SemanticException {
+ // For each table reference get the table name
+ // and the alias (if alias is not present, the table name
+ // is used as an alias)
+ int[] indexes = findTabRefIdxs(tabref);
+ int aliasIndex = indexes[0];
+ int propsIndex = indexes[1];
+ int tsampleIndex = indexes[2];
+ int ssampleIndex = indexes[3];
+
+ ASTNode tableTree = (ASTNode) (tabref.getChild(0));
+
+ String tabIdName = getUnescapedName(tableTree).toLowerCase();
+
+ String alias = findSimpleTableName(tabref, aliasIndex);
+
+ if (propsIndex >= 0) {
+ Tree propsAST = tabref.getChild(propsIndex);
+ Map<String, String> props = DDLSemanticAnalyzer.getProps((ASTNode) propsAST.getChild(0));
+ // We get the information from Calcite.
+ if ("TRUE".equals(props.get("insideView"))) {
+ qb.getAliasInsideView().add(alias.toLowerCase());
+ }
+ qb.setTabProps(alias, props);
+ }
+
+ // If the alias is already there then we have a conflict
+ if (qb.exists(alias)) {
+ throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(tabref
+ .getChild(aliasIndex)));
+ }
+ if (tsampleIndex >= 0) {
+ ASTNode sampleClause = (ASTNode) tabref.getChild(tsampleIndex);
+ ArrayList<ASTNode> sampleCols = new ArrayList<ASTNode>();
+ if (sampleClause.getChildCount() > 2) {
+ for (int i = 2; i < sampleClause.getChildCount(); i++) {
+ sampleCols.add((ASTNode) sampleClause.getChild(i));
+ }
+ }
+ // TODO: For now only support sampling on up to two columns
+ // Need to change it to list of columns
+ if (sampleCols.size() > 2) {
+ throw new SemanticException(generateErrorMessage(
+ (ASTNode) tabref.getChild(0),
+ ErrorMsg.SAMPLE_RESTRICTION.getMsg()));
+ }
+ TableSample tabSample = new TableSample(
+ unescapeIdentifier(sampleClause.getChild(0).getText()),
+ unescapeIdentifier(sampleClause.getChild(1).getText()),
+ sampleCols);
+ qb.getParseInfo().setTabSample(alias, tabSample);
+ if (unparseTranslator.isEnabled()) {
+ for (ASTNode sampleCol : sampleCols) {
+ unparseTranslator.addIdentifierTranslation((ASTNode) sampleCol
+ .getChild(0));
+ }
+ }
+ } else if (ssampleIndex >= 0) {
+ ASTNode sampleClause = (ASTNode) tabref.getChild(ssampleIndex);
+
+ Tree type = sampleClause.getChild(0);
+ Tree numerator = sampleClause.getChild(1);
+ String value = unescapeIdentifier(numerator.getText());
+
+
+ SplitSample sample;
+ if (type.getType() == HiveParser.TOK_PERCENT) {
+ assertCombineInputFormat(numerator, "Percentage");
+ double percent = Double.parseDouble(value);
+ if (percent < 0 || percent > 100) {
+ throw new SemanticException(generateErrorMessage((ASTNode) numerator,
+ "Sampling percentage should be between 0 and 100"));
+ }
+ int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM);
+ sample = new SplitSample(percent, seedNum);
+ } else if (type.getType() == HiveParser.TOK_ROWCOUNT) {
+ sample = new SplitSample(Integer.parseInt(value));
+ } else {
+ assert type.getType() == HiveParser.TOK_LENGTH;
+ assertCombineInputFormat(numerator, "Total Length");
+ long length = Integer.parseInt(value.substring(0, value.length() - 1));
+ char last = value.charAt(value.length() - 1);
+ if (last == 'k' || last == 'K') {
+ length <<= 10;
+ } else if (last == 'm' || last == 'M') {
+ length <<= 20;
+ } else if (last == 'g' || last == 'G') {
+ length <<= 30;
+ }
+ int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM);
+ sample = new SplitSample(length, seedNum);
+ }
+ String alias_id = getAliasId(alias, qb);
+ nameToSplitSample.put(alias_id, sample);
+ }
+ // Insert this map into the stats
+ qb.setTabAlias(alias, tabIdName);
+ if (qb.isInsideView()) {
+ qb.getAliasInsideView().add(alias.toLowerCase());
+ }
+ qb.addAlias(alias);
+
+ qb.getParseInfo().setSrcForAlias(alias, tableTree);
+
+ // if alias to CTE contains the table name, we do not do the translation because
+ // cte is actually a subquery.
+ if (!this.aliasToCTEs.containsKey(tabIdName)) {
+ unparseTranslator.addTableNameTranslation(tableTree, SessionState.get().getCurrentDatabase());
+ if (aliasIndex != 0) {
+ unparseTranslator.addIdentifierTranslation((ASTNode) tabref.getChild(aliasIndex));
+ }
+ }
+
+ return alias;
+ }
+
+ Map<String, SplitSample> getNameToSplitSampleMap() {
+ return this.nameToSplitSample;
+ }
+
+ /**
+ * Convert a string to Text format and write its bytes in the same way TextOutputFormat would do.
+ * This is needed to properly encode non-ascii characters.
+ */
+ private static void writeAsText(String text, FSDataOutputStream out) throws IOException {
+ Text to = new Text(text);
+ out.write(to.getBytes(), 0, to.getLength());
+ }
+
+ /**
+ * Generate a temp table out of a values clause
+ * See also {@link #preProcessForInsert(ASTNode, QB)}
+ */
+ private ASTNode genValuesTempTable(ASTNode originalFrom, QB qb) throws SemanticException {
+ Path dataDir = null;
+ if(!qb.getEncryptedTargetTablePaths().isEmpty()) {
+ //currently only Insert into T values(...) is supported thus only 1 values clause
+ //and only 1 target table are possible. If/when support for
+ //select ... from values(...) is added an insert statement may have multiple
+ //encrypted target tables.
+ dataDir = ctx.getMRTmpPath(qb.getEncryptedTargetTablePaths().get(0).toUri());
+ }
+ // Pick a name for the table
+ SessionState ss = SessionState.get();
+ String tableName = VALUES_TMP_TABLE_NAME_PREFIX + ss.getNextValuesTempTableSuffix();
+
+ // Step 1, parse the values clause we were handed
+ List<? extends Node> fromChildren = originalFrom.getChildren();
+ // First child should be the virtual table ref
+ ASTNode virtualTableRef = (ASTNode)fromChildren.get(0);
+ assert virtualTableRef.getToken().getType() == HiveParser.TOK_VIRTUAL_TABREF :
+ "Expected first child of TOK_VIRTUAL_TABLE to be TOK_VIRTUAL_TABREF but was " +
+ virtualTableRef.getName();
+
+ List<? extends Node> virtualTableRefChildren = virtualTableRef.getChildren();
+ // First child of this should be the table name. If it's anonymous,
+ // then we don't have a table name.
+ ASTNode tabName = (ASTNode)virtualTableRefChildren.get(0);
+ if (tabName.getToken().getType() != HiveParser.TOK_ANONYMOUS) {
+ // TODO, if you want to make select ... from (values(...) as foo(...) work,
+ // you need to parse this list of columns names and build it into the table
+ throw new SemanticException(ErrorMsg.VALUES_TABLE_CONSTRUCTOR_NOT_SUPPORTED.getMsg());
+ }
+
+ // The second child of the TOK_VIRTUAL_TABLE should be TOK_VALUES_TABLE
+ ASTNode valuesTable = (ASTNode)fromChildren.get(1);
+ assert valuesTable.getToken().getType() == HiveParser.TOK_VALUES_TABLE :
+ "Expected second child of TOK_VIRTUAL_TABLE to be TOK_VALUE_TABLE but was " +
+ valuesTable.getName();
+ // Each of the children of TOK_VALUES_TABLE will be a TOK_VALUE_ROW
+ List<? extends Node> valuesTableChildren = valuesTable.getChildren();
+
+ // Now that we're going to start reading through the rows, open a file to write the rows to.
+ // If we leave this method before creating the temporary table we need to be sure to clean up
+ // this file.
+ Path tablePath = null;
+ FileSystem fs = null;
+ FSDataOutputStream out = null;
+ try {
+ if(dataDir == null) {
+ tablePath = Warehouse.getDnsPath(new Path(ss.getTempTableSpace(), tableName), conf);
+ }
+ else {
+ //if target table of insert is encrypted, make sure temporary table data is stored
+ //similarly encrypted
+ tablePath = Warehouse.getDnsPath(new Path(dataDir, tableName), conf);
+ }
+ fs = tablePath.getFileSystem(conf);
+ fs.mkdirs(tablePath);
+ Path dataFile = new Path(tablePath, "data_file");
+ out = fs.create(dataFile);
+ List<FieldSchema> fields = new ArrayList<FieldSchema>();
+
+ boolean firstRow = true;
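+ // Illustrative row encoding: columns are separated by Ctrl-A (0x01) and rows by '\n',
+ // matching Hive's default text-table delimiters for the TextFile temp table created below.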
+ for (Node n : valuesTableChildren) {
+ ASTNode valuesRow = (ASTNode) n;
+ assert valuesRow.getToken().getType() == HiveParser.TOK_VALUE_ROW :
+ "Expected child of TOK_VALUE_TABLE to be TOK_VALUE_ROW but was " + valuesRow.getName();
+ // Each of the children of this should be a literal
+ List<? extends Node> valuesRowChildren = valuesRow.getChildren();
+ boolean isFirst = true;
+ int nextColNum = 1;
+ for (Node n1 : valuesRowChildren) {
+ ASTNode value = (ASTNode) n1;
+ if (firstRow) {
+ fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", ""));
+ }
+ if (isFirst) {
+ isFirst = false;
+ } else {
+ writeAsText("\u0001", out);
+ }
+ writeAsText(unparseExprForValuesClause(value), out);
+ }
+ writeAsText("\n", out);
+ firstRow = false;
+ }
+
+ // Step 2, create a temp table, using the created file as the data
+ StorageFormat format = new StorageFormat(conf);
+ format.processStorageFormat("TextFile");
+ Table table = db.newTable(tableName);
+ table.setSerializationLib(format.getSerde());
+ table.setFields(fields);
+ table.setDataLocation(tablePath);
+ table.getTTable().setTemporary(true);
+ table.setStoredAsSubDirectories(false);
+ table.setInputFormatClass(format.getInputFormat());
+ table.setOutputFormatClass(format.getOutputFormat());
+ db.createTable(table, false);
+ } catch (Exception e) {
+ String errMsg = ErrorMsg.INSERT_CANNOT_CREATE_TEMP_FILE.getMsg() + e.getMessage();
+ LOG.error(errMsg);
+ // Try to delete the file
+ if (fs != null && tablePath != null) {
+ try {
+ fs.delete(tablePath, false);
+ } catch (IOException swallowIt) {}
+ }
+ throw new SemanticException(errMsg, e);
+ } finally {
+ IOUtils.closeStream(out);
+ }
+
+ // Step 3, return a new subtree with a from clause built around that temp table
+ // The form of the tree is TOK_TABREF->TOK_TABNAME->identifier(tablename)
+ Token t = new ClassicToken(HiveParser.TOK_TABREF);
+ ASTNode tabRef = new ASTNode(t);
+ t = new ClassicToken(HiveParser.TOK_TABNAME);
+ ASTNode tabNameNode = new ASTNode(t);
+ tabRef.addChild(tabNameNode);
+ t = new ClassicToken(HiveParser.Identifier, tableName);
+ ASTNode identifier = new ASTNode(t);
+ tabNameNode.addChild(identifier);
+ return tabRef;
+ }
+
+ // Take an expression in the values clause and turn it back into a string. This is far from
+ // comprehensive. At the moment it only supports:
+ // * literals (all types)
+ // * unary negatives
+ // * true/false
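+ // Illustrative mappings from the cases below: 42 -> "42", 'ab' -> ab (unescaped),
+ // TRUE -> "TRUE", FALSE -> "" (empty string), -7 -> "-7", NULL -> "\N".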
+ private String unparseExprForValuesClause(ASTNode expr) throws SemanticException {
+ switch (expr.getToken().getType()) {
+ case HiveParser.Number:
+ return expr.getText();
+
+ case HiveParser.StringLiteral:
+ return BaseSemanticAnalyzer.unescapeSQLString(expr.getText());
+
+ case HiveParser.KW_FALSE:
+ // UDFToBoolean casts any non-empty string to true, so set this to false
+ return "";
+
+ case HiveParser.KW_TRUE:
+ return "TRUE";
+
+ case HiveParser.MINUS:
+ return "-" + unparseExprForValuesClause((ASTNode)expr.getChildren().get(0));
+
+ case HiveParser.TOK_NULL:
+ // Hive's text input will translate this as a null
+ return "\\N";
+
+ default:
+ throw new SemanticException("Expression of type " + expr.getText() +
+ " not supported in insert/values");
+ }
+
+ }
+
+ private void assertCombineInputFormat(Tree numerator, String message) throws SemanticException {
+ String inputFormat = conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") ?
+ HiveConf.getVar(conf, HiveConf.ConfVars.HIVETEZINPUTFORMAT):
+ HiveConf.getVar(conf, HiveConf.ConfVars.HIVEINPUTFORMAT);
+ if (!inputFormat.equals(CombineHiveInputFormat.class.getName())) {
+ throw new SemanticException(generateErrorMessage((ASTNode) numerator,
+ message + " sampling is not supported in " + inputFormat));
+ }
+ }
+
+ private String processSubQuery(QB qb, ASTNode subq) throws SemanticException {
+
+ // This is a subquery and must have an alias
+ if (subq.getChildCount() != 2) {
+ throw new SemanticException(ErrorMsg.NO_SUBQUERY_ALIAS.getMsg(subq));
+ }
+ ASTNode subqref = (ASTNode) subq.getChild(0);
+ String alias = unescapeIdentifier(subq.getChild(1).getText());
+
+ // Recursively do the first phase of semantic analysis for the subquery
+ QBExpr qbexpr = new QBExpr(alias);
+
+ doPhase1QBExpr(subqref, qbexpr, qb.getId(), alias, qb.isInsideView());
+
+ // If the alias is already there then we have a conflict
+ if (qb.exists(alias)) {
+ throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(subq
+ .getChild(1)));
+ }
+ // Insert this map into the stats
+ qb.setSubqAlias(alias, qbexpr);
+ qb.addAlias(alias);
+
+ unparseTranslator.addIdentifierTranslation((ASTNode) subq.getChild(1));
+
+ return alias;
+ }
+
+ /*
+ * Phase1: hold onto any CTE definitions in aliasToCTE.
+ * CTE definitions are global to the Query.
+ */
+ private void processCTE(QB qb, ASTNode ctes) throws SemanticException {
+
+ int numCTEs = ctes.getChildCount();
+
+ for(int i=0; i <numCTEs; i++) {
+ ASTNode cte = (ASTNode) ctes.getChild(i);
+ ASTNode cteQry = (ASTNode) cte.getChild(0);
+ String alias = unescapeIdentifier(cte.getChild(1).getText());
+
+ String qName = qb.getId() == null ? "" : qb.getId() + ":";
+ qName += alias.toLowerCase();
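+ // e.g., a CTE named "c" defined inside a subquery with id "q1" is registered as "q1:c";
+ // at the top level (null QB id) it is registered simply as "c" (illustrative).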
+
+ if ( aliasToCTEs.containsKey(qName)) {
+ throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(cte.getChild(1)));
+ }
+ aliasToCTEs.put(qName, new CTEClause(qName, cteQry));
+ }
+ }
+
+ /*
+ * We allow CTE definitions in views. So we can end up with a hierarchy of CTE definitions:
+ * - at the top level of a query statement
+ * - where a view is referenced.
+ * - views may refer to other views.
+ *
+ * The scoping rules we use are: search for a CTE from the current QB outwards. In order to
+ * disambiguate between CTEs at different levels, we qualify (prefix) them with the id of the QB
+ * they appear in when adding them to the <code>aliasToCTEs</code> map.
+ *
+ */
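+ // Illustrative lookup order: for cteName "c" referenced from a QB with id "v1:v2", the search
+ // tries "v1:v2:c", then "v1:c", and finally the unqualified top-level "c".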
+ private CTEClause findCTEFromName(QB qb, String cteName) {
+ StringBuilder qId = new StringBuilder();
+ if (qb.getId() != null) {
+ qId.append(qb.getId());
+ }
+
+ while (qId.length() > 0) {
+ String nm = qId + ":" + cteName;
+ CTEClause cte = aliasToCTEs.get(nm);
+ if (cte != null) {
+ return cte;
+ }
+ int lastIndex = qId.lastIndexOf(":");
+ lastIndex = lastIndex < 0 ? 0 : lastIndex;
+ qId.setLength(lastIndex);
+ }
+ return aliasToCTEs.get(cteName);
+ }
+
+ /*
+ * If a CTE is referenced in a QueryBlock:
+ * - add it as a SubQuery for now.
+ * - SQ.alias is the alias used in the QB. (If no alias is specified,
+ * it uses the CTE name; this works just like table references.)
+ * - Adding SQ done by:
+ * - copying AST of CTE
+ * - setting ASTOrigin on cloned AST.
+ * - trigger phase 1 on new QBExpr.
+ * - update QB data structs: remove this as a table reference, move it to a SQ invocation.
+ */
+ private void addCTEAsSubQuery(QB qb, String cteName, String cteAlias)
+ throws SemanticException {
+ cteAlias = cteAlias == null ? cteName : cteAlias;
+ CTEClause cte = findCTEFromName(qb, cteName);
+ ASTNode cteQryNode = cte.cteNode;
+ QBExpr cteQBExpr = new QBExpr(cteAlias);
+ doPhase1QBExpr(cteQryNode, cteQBExpr, qb.getId(), cteAlias);
+ qb.rewriteCTEToSubq(cteAlias, cteName, cteQBExpr);
+ }
+
+ private final CTEClause rootClause = new CTEClause(null, null);
+
+ @Override
+ public List<Task<? extends Serializable>> getAllRootTasks() {
+ if (!rootTasksResolved) {
+ rootTasks = toRealRootTasks(rootClause.asExecutionOrder());
+ rootTasksResolved = true;
+ }
+ return rootTasks;
+ }
+
+ @Override
+ public HashSet<ReadEntity> getAllInputs() {
+ HashSet<ReadEntity> readEntities = new HashSet<ReadEntity>(getInputs());
+ for (CTEClause cte : rootClause.asExecutionOrder()) {
+ if (cte.source != null) {
+ readEntities.addAll(cte.source.getInputs());
+ }
+ }
+ return readEntities;
+ }
+
+ @Override
+ public HashSet<WriteEntity> getAllOutputs() {
+ HashSet<WriteEntity> writeEntities = new HashSet<WriteEntity>(getOutputs());
+ for (CTEClause cte : rootClause.asExecutionOrder()) {
+ if (cte.source != null) {
+ writeEntities.addAll(cte.source.getOutputs());
+ }
+ }
+ return writeEntities;
+ }
+
+ class CTEClause {
+ CTEClause(String alias, ASTNode cteNode) {
+ this.alias = alias;
+ this.cteNode = cteNode;
+ }
+ String alias;
+ ASTNode cteNode;
+ boolean materialize;
+ int reference;
+ QBExpr qbExpr;
+ List<CTEClause> parents = new ArrayList<CTEClause>();
+
+ // materialized
+ Table table;
+ SemanticAnalyzer source;
+
+ List<Task<? extends Serializable>> getTasks() {
+ return source == null ? null : source.rootTasks;
+ }
+
+ List<CTEClause> asExecutionOrder() {
+ List<CTEClause> execution = new ArrayList<CTEClause>();
+ asExecutionOrder(new HashSet<CTEClause>(), execution);
+ return execution;
+ }
+
+ void asExecutionOrder(Set<CTEClause> visited, List<CTEClause> execution) {
+ for (CTEClause parent : parents) {
+ if (visited.add(parent)) {
+ parent.asExecutionOrder(visited, execution);
+ }
+ }
+ execution.add(this);
+ }
+
+ @Override
+ public String toString() {
+ return alias == null ? "<root>" : alias;
+ }
+ }
+
+ private List<Task<? extends Serializable>> toRealRootTasks(List<CTEClause> execution) {
+ List<Task<? extends Serializable>> cteRoots = new ArrayList<>();
+ List<Task<? extends Serializable>> cteLeafs = new ArrayList<>();
+ List<Task<? extends Serializable>> curTopRoots = null;
+ List<Task<? extends Serializable>> curBottomLeafs = null;
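+ // Illustrative: CTE task chains are wired leaf -> root in execution order, and the leaf
+ // tasks of all CTE chains become prerequisites of the main query's root tasks below.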
+ for (int i = 0; i < execution.size(); i++) {
+ CTEClause current = execution.get(i);
+ if (current.parents.isEmpty() && curTopRoots != null) {
+ cteRoots.addAll(curTopRoots);
+ cteLeafs.addAll(curBottomLeafs);
+ curTopRoots = curBottomLeafs = null;
+ }
+ List<Task<? extends Serializable>> curTasks = current.getTasks();
+ if (curTasks == null) {
+ continue;
+ }
+ if (curTopRoots == null) {
+ curTopRoots = curTasks;
+ }
+ if (curBottomLeafs != null) {
+ for (Task<?> topLeafTask : curBottomLeafs) {
+ for (Task<?> currentRootTask : curTasks) {
+ topLeafTask.addDependentTask(currentRootTask);
+ }
+ }
+ }
+ curBottomLeafs = Task.findLeafs(curTasks);
+ }
+ if (curTopRoots != null) {
+ cteRoots.addAll(curTopRoots);
+ cteLeafs.addAll(curBottomLeafs);
+ }
+
+ if (cteRoots.isEmpty()) {
+ return rootTasks;
+ }
+ for (Task<?> cteLeafTask : cteLeafs) {
+ for (Task<?> mainRootTask : rootTasks) {
+ cteLeafTask.addDependentTask(mainRootTask);
+ }
+ }
+ return cteRoots;
+ }
+
+ Table materializeCTE(String cteName, CTEClause cte) throws HiveException {
+
+ ASTNode createTable = new ASTNode(new ClassicToken(HiveParser.TOK_CREATETABLE));
+
+ ASTNode tableName = new ASTNode(new ClassicToken(HiveParser.TOK_TABNAME));
+ tableName.addChild(new ASTNode(new ClassicToken(HiveParser.Identifier, cteName)));
+
+ ASTNode temporary = new ASTNode(new ClassicToken(HiveParser.KW_TEMPORARY, MATERIALIZATION_MARKER));
+
+ createTable.addChild(tableName);
+ createTable.addChild(temporary);
+ createTable.addChild(cte.cteNode);
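+ // Illustrative: the synthesized AST is roughly "CREATE TEMPORARY TABLE <cteName> AS <cte query>",
+ // with MATERIALIZATION_MARKER tagging it as an internal materialization rather than user DDL.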
+
+ SemanticAnalyzer analyzer = new SemanticAnalyzer(queryState);
+ analyzer.initCtx(ctx);
+ analyzer.init(false);
+
+ // should share cte contexts
+ analyzer.aliasToCTEs.putAll(aliasToCTEs);
+
+ HiveOperation operation = queryState.getHiveOperation();
+ try {
+ analyzer.analyzeInternal(createTable);
+ } finally {
+ queryState.setCommandType(operation);
+ }
+
+ Table table = analyzer.tableDesc.toTable(conf);
+ Path location = table.getDataLocation();
+ try {
+ location.getFileSystem(conf).mkdirs(location);
+ } catch (IOException e) {
+ throw new HiveException(e);
+ }
+ table.setMaterializedTable(true);
+
+ LOG.info(cteName + " will be materialized into " + location);
+ cte.table = table;
+ cte.source = analyzer;
+
+ ctx.addMaterializedTable(cteName, table);
+
+ return table;
+ }
+
+
+ static boolean isJoinToken(ASTNode node) {
+ if ((node.getToken().getType() == HiveParser.TOK_JOIN)
+ || (node.getToken().getType() == HiveParser.TOK_CROSSJOIN)
+ || isOuterJoinToken(node)
+ || (node.getToken().getType() == HiveParser.TOK_LEFTSEMIJOIN)
+ || (node.getToken().getType() == HiveParser.TOK_UNIQUEJOIN)) {
+ return true;
+ }
+
+ return false;
+ }
+
+ static private boolean isOuterJoinToken(ASTNode node) {
+ return (node.getToken().getType() == HiveParser.TOK_LEFTOUTERJOIN)
+ || (node.getToken().getType() == HiveParser.TOK_RIGHTOUTERJOIN)
+ || (node.getToken().getType() == HiveParser.TOK_FULLOUTERJOIN);
+ }
+
+ /**
+ * Given the AST with TOK_JOIN as the root, get all the aliases for the tables
+ * or subqueries in the join.
+ *
+ * @param qb
+ * @param join
+ * @throws SemanticException
+ */
+ @SuppressWarnings("nls")
+ private void processJoin(QB qb, ASTNode join) throws SemanticException {
+ int numChildren = join.getChildCount();
+ if ((numChildren != 2) && (numChildren != 3)
+ && join.getToken().getType() != HiveParser.TOK_UNIQUEJOIN) {
+ throw new SemanticException(generateErrorMessage(join,
+ "Join with multiple children"));
+ }
+
+ queryProperties.incrementJoinCount(isOuterJoinToken(join));
+ for (int num = 0; num < numChildren; num++) {
+ ASTNode child = (ASTNode) join.getChild(num);
+ if (child.getToken().getType() == HiveParser.TOK_TABREF) {
+ processTable(qb, child);
+ } else if (child.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+ processSubQuery(qb, child);
+ } else if (child.getToken().getType() == HiveParser.TOK_PTBLFUNCTION) {
+ queryProperties.setHasPTF(true);
+ processPTF(qb, child);
+ PTFInvocationSpec ptfInvocationSpec = qb.getPTFInvocationSpec(child);
+ String inputAlias = ptfInvocationSpec == null ? null :
+ ptfInvocationSpec.getFunction().getAlias();
+ if ( inputAlias == null ) {
+ throw new SemanticException(generateErrorMessage(child,
+ "PTF invocation in a Join must have an alias"));
+ }
+
+ } else if (child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW ||
+ child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) {
+ // SELECT * FROM src1 LATERAL VIEW udtf() AS myTable JOIN src2 ...
+ // is not supported. Instead, the lateral view must be in a subquery
+ // SELECT * FROM (SELECT * FROM src1 LATERAL VIEW udtf() AS myTable) a
+ // JOIN src2 ...
+ throw new SemanticException(ErrorMsg.LATERAL_VIEW_WITH_JOIN
+ .getMsg(join));
+ } else if (isJoinToken(child)) {
+ processJoin(qb, child);
+ }
+ }
+ }
+
+ /**
+ * Given the AST with TOK_LATERAL_VIEW as the root, get the alias for the
+ * table or subquery in the lateral view and also make a mapping from the
+ * alias to all the lateral view AST's.
+ *
+ * @param qb
+ * @param lateralView
+ * @return the alias for the table/subquery
+ * @throws SemanticException
+ */
+
+ private String processLateralView(QB qb, ASTNode lateralView)
+ throws SemanticException {
+ int numChildren = lateralView.getChildCount();
+
+ assert (numChildren == 2);
+ ASTNode next = (ASTNode) lateralView.getChild(1);
+
+ String alias = null;
+
+ switch (next.getToken().getType()) {
+ case HiveParser.TOK_TABREF:
+ alias = processTable(qb, next);
+ break;
+ case HiveParser.TOK_SUBQUERY:
+ alias = processSubQuery(qb, next);
+ break;
+ case HiveParser.TOK_LATERAL_VIEW:
+ case HiveParser.TOK_LATERAL_VIEW_OUTER:
+ alias = processLateralView(qb, next);
+ break;
+ default:
+ throw new SemanticException(ErrorMsg.LATERAL_VIEW_INVALID_CHILD
+ .getMsg(lateralView));
+ }
+ alias = alias.toLowerCase();
+ qb.getParseInfo().addLateralViewForAlias(alias, lateralView);
+ qb.addAlias(alias);
+ return alias;
+ }
+
+ /**
+ * Phase 1: (including, but not limited to):
+ *
+ * 1. Gets all the aliases for all the tables / subqueries and makes the
+ * appropriate mapping in aliasToTabs, aliasToSubq.
+ * 2. Gets the location of the destination and names the clause "inclause" + i.
+ * 3. Creates a map from a string representation of an aggregation tree to the
+ * actual aggregation AST.
+ * 4. Creates a mapping from the clause name to the select expression AST in
+ * destToSelExpr.
+ * 5. Creates a mapping from a table alias to the lateral view AST's in
+ * aliasToLateralViews.
+ *
+ * @param ast
+ * @param qb
+ * @param ctx_1
+ * @throws SemanticException
+ */
+ @SuppressWarnings({"fallthrough", "nls"})
+ public boolean doPhase1(ASTNode ast, QB qb, Phase1Ctx ctx_1, PlannerContext plannerCtx)
+ throws SemanticException {
+
+ boolean phase1Result = true;
+ QBParseInfo qbp = qb.getParseInfo();
+ boolean skipRecursion = false;
+
+ if (ast.getToken() != null) {
+ skipRecursion = true;
+ switch (ast.getToken().getType()) {
+ case HiveParser.TOK_SELECTDI:
+ qb.countSelDi();
+ // fall through
+ case HiveParser.TOK_SELECT:
+ qb.countSel();
+ qbp.setSelExprForClause(ctx_1.dest, ast);
+
+ int posn = 0;
+ if (((ASTNode) ast.getChild(0)).getToken().getType() == HiveParser.QUERY_HINT) {
+ ParseDriver pd = new ParseDriver();
+ String queryHintStr = ast.getChild(0).getText();
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("QUERY HINT: "+queryHintStr);
+ }
+ try {
+ ASTNode hintNode = pd.parseHint(queryHintStr);
+ qbp.setHints((ASTNode) hintNode);
+ posn++;
+ } catch (ParseException e) {
+ throw new SemanticException("failed to parse query hint: "+e.getMessage(), e);
+ }
+ }
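+ // Illustrative: for a query such as "SELECT /*+ MAPJOIN(b) */ ...", the hint text arrives
+ // as the QUERY_HINT child above, is re-parsed into an AST, and stored on the QBParseInfo.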
+
+ if (ast.getChild(posn).getChild(0).getType() == HiveParser.TOK_TRANSFORM) {
+ queryProperties.setUsesScript(true);
+ }
+
+ LinkedHashMap<String, ASTNode> aggregations = doPhase1GetAggregationsFromSelect(ast,
+ qb, ctx_1.dest);
+ doPhase1GetColumnAliasesFromSelect(ast, qbp);
+ qbp.setAggregationExprsForClause(ctx_1.dest, aggregations);
+ qbp.setDistinctFuncExprsForClause(ctx_1.dest,
+ doPhase1GetDistinctFuncExprs(aggregations));
+ break;
+
+ case HiveParser.TOK_WHERE:
+ qbp.setWhrExprForClause(ctx_1.dest, ast);
+ if (!SubQueryUtils.findSubQueries((ASTNode) ast.getChild(0)).isEmpty())
+ queryProperties.setFilterWithSubQuery(true);
+ break;
+
+ case HiveParser.TOK_INSERT_INTO:
+ String currentDatabase = SessionState.get().getCurrentDatabase();
+ String tab_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0), currentDatabase);
+ qbp.addInsertIntoTable(tab_name, ast);
+
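+ // fall through to TOK_DESTINATION: an INSERT INTO also carries a destination clause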
+ case HiveParser.TOK_DESTINATION:
+ ctx_1.dest = this.ctx.getDestNamePrefix(ast).toString() + ctx_1.nextNum;
+ ctx_1.nextNum++;
+ boolean isTmpFileDest = false;
+ if (ast.getChildCount() > 0 && ast.getChild(0) instanceof ASTNode) {
+ ASTNode ch = (ASTNode) ast.getChild(0);
+ if (ch.getToken().getType() == HiveParser.TOK_DIR && ch.getChildCount() > 0
+ && ch.getChild(0) instanceof ASTNode) {
+ ch = (ASTNode) ch.getChild(0);
+ isTmpFileDest = ch.getToken().getType() == HiveParser.TOK_TMP_FILE;
+ } else {
+ if (ast.getToken().getType() == HiveParser.TOK_DESTINATION
+ && ast.getChild(0).getType() == HiveParser.TOK_TAB) {
+ String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0),
+ SessionState.get().getCurrentDatabase());
+ qbp.getInsertOverwriteTables().put(fullTableName, ast);
+ }
+ }
+ }
+
+ // is there an insert in the subquery
+ if (qbp.getIsSubQ() && !isTmpFileDest) {
+ throw new SemanticException(ErrorMsg.NO_INSERT_INSUBQUERY.getMsg(ast));
+ }
+
+ qbp.setDestForClause(ctx_1.dest, (ASTNode) ast.getChild(0));
+ handleInsertStatementSpecPhase1(ast, qbp, ctx_1);
+
+ if (qbp.getClauseNamesForDest().size() == 2) {
+ // Once we have two destination clauses, we know that this
+ // is a multi-insert query, so set the property accordingly.
+ // Using qbp.getClauseNamesForDest().size() >= 2 would be
+ // equivalent, but we use == to avoid setting the property
+ // multiple times
+ queryProperties.setMultiDestQuery(true);
+ }
+
+ if (plannerCtx != null && !queryProperties.hasMultiDestQuery()) {
+ plannerCtx.setInsertToken(ast, isTmpFileDest);
+ } else if (plannerCtx != null && qbp.getClauseNamesForDest().size() == 2) {
+ // For multi-insert query, currently we only optimize the FROM clause.
+ // Hence, introduce multi-insert token on top of it.
+ // However, first we need to reset existing token (insert).
+ // Using qbp.getClauseNamesForDest().size() >= 2 would be
+ // equivalent, but we use == to avoid resetting the token
+ // multiple times
+ plannerCtx.resetToken();
+ plannerCtx.setMultiInsertToken((ASTNode) qbp.getQueryFrom().getChild(0));
+ }
+ break;
+
+ case HiveParser.TOK_FROM:
+ int child_count = ast.getChildCount();
+ if (child_count != 1) {
+ throw new SemanticException(generateErrorMessage(ast,
+ "Multiple Children " + child_count));
+ }
+
+ if (!qbp.getIsSubQ()) {
+ qbp.setQueryFromExpr(ast);
+ }
+
+ // Check if this is a subquery / lateral view
+ ASTNode frm = (ASTNode) ast.getChild(0);
+ if (frm.getToken().getType() == HiveParser.TOK_TABREF) {
+ processTable(qb, frm);
+ } else if (frm.getToken().getType() == HiveParser.TOK_VIRTUAL_TABLE) {
+ // Create a temp table with the passed values in it then rewrite this portion of the
+ // tree to be from that table.
+ ASTNode newFrom = genValuesTempTable(frm, qb);
+ ast.setChild(0, newFrom);
+ processTable(qb, newFrom);
+ } else if (frm.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+ processSubQuery(qb, frm);
+ } else if (frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW ||
+ frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) {
+ queryProperties.setHasLateralViews(true);
+ processLateralView(qb, frm);
+ } else if (isJoinToken(frm)) {
+ processJoin(qb, frm);
+ qbp.setJoinExpr(frm);
+ } else if (frm.getToken().getType() == HiveParser.TOK_PTBLFUNCTION) {
+ queryProperties.setHasPTF(true);
+ processPTF(qb, frm);
+ }
+ break;
+
+ case HiveParser.TOK_CLUSTERBY:
+ // Get the clusterby aliases - these are aliased to the entries in the
+ // select list
+ queryProperties.setHasClusterBy(true);
+ qbp.setClusterByExprForClause(ctx_1.dest, ast);
+ break;
+
+ case HiveParser.TOK_DISTRIBUTEBY:
+ // Get the distribute by aliases - these are aliased to the entries in
+ // the
+ // select list
+ queryProperties.setHasDistributeBy(true);
+ qbp.setDistributeByExprForClause(ctx_1.dest, ast);
+ if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.CLUSTERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+ } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.ORDERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+ }
+ break;
+
+ case HiveParser.TOK_SORTBY:
+ // Get the sort by aliases - these are aliased to the entries in the
+ // select list
+ queryProperties.setHasSortBy(true);
+ qbp.setSortByExprForClause(ctx_1.dest, ast);
+ if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.CLUSTERBY_SORTBY_CONFLICT.getMsg()));
+ } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.ORDERBY_SORTBY_CONFLICT.getMsg()));
+ }
+
+ break;
+
+ case HiveParser.TOK_ORDERBY:
+ // Get the order by aliases - these are aliased to the entries in the
+ // select list
+ queryProperties.setHasOrderBy(true);
+ qbp.setOrderByExprForClause(ctx_1.dest, ast);
+ if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.CLUSTERBY_ORDERBY_CONFLICT.getMsg()));
+ }
+ break;
+
+ case HiveParser.TOK_GROUPBY:
+ case HiveParser.TOK_ROLLUP_GROUPBY:
+ case HiveParser.TOK_CUBE_GROUPBY:
+ case HiveParser.TOK_GROUPING_SETS:
+ // Get the groupby aliases - these are aliased to the entries in the
+ // select list
+ queryProperties.setHasGroupBy(true);
+ if (qbp.getJoinExpr() != null) {
+ queryProperties.setHasJoinFollowedByGroupBy(true);
+ }
+ if (qbp.getSelForClause(ctx_1.dest).getToken().getType() == HiveParser.TOK_SELECTDI) {
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.SELECT_DISTINCT_WITH_GROUPBY.getMsg()));
+ }
+ qbp.setGroupByExprForClause(ctx_1.dest, ast);
+ skipRecursion = true;
+
+ // Rollup and Cubes are syntactic sugar on top of grouping sets
+ if (ast.getToken().getType() == HiveParser.TOK_ROLLUP_GROUPBY) {
+ qbp.getDestRollups().add(ctx_1.dest);
+ } else if (ast.getToken().getType() == HiveParser.TOK_CUBE_GROUPBY) {
+ qbp.getDestCubes().add(ctx_1.dest);
+ } else if (ast.getToken().getType() == HiveParser.TOK_GROUPING_SETS) {
+ qbp.getDestGroupingSets().add(ctx_1.dest);
+ }
+ break;
+
+ case HiveParser.TOK_HAVING:
+ qbp.setHavingExprForClause(ctx_1.dest, ast);
+ qbp.addAggregationExprsForClause(ctx_1.dest,
+ doPhase1GetAggregationsFromSelect(ast, qb, ctx_1.dest));
+ break;
+
+ case HiveParser.KW_WINDOW:
+ if (!qb.hasWindowingSpec(ctx_1.dest) ) {
+ throw new SemanticException(generateErrorMessage(ast,
+ "Query has no Cluster/Distribute By; but has a Window definition"));
+ }
+ handleQueryWindowClauses(qb, ctx_1, ast);
+ break;
+
+ case HiveParser.TOK_LIMIT:
+ if (ast.getChildCount() == 2) {
+ qbp.setDestLimit(ctx_1.dest,
+ new Integer(ast.getChild(0).getText()),
+ new Integer(ast.getChild(1).getText()));
+ } else {
+ qbp.setDestLimit(ctx_1.dest, new Integer(0),
+ new Integer(ast.getChild(0).getText()));
+ }
+ break;
+
+ case HiveParser.TOK_ANALYZE:
+ // Case of analyze command
+
+ String table_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0)).toLowerCase();
+
+
+ qb.setTabAlias(table_name, table_name);
+ qb.addAlias(table_name);
+ qb.getParseInfo().setIsAnalyzeCommand(true);
+ qb.getParseInfo().setNoScanAnalyzeCommand(this.noscan);
+ qb.getParseInfo().setPartialScanAnalyzeCommand(this.partialscan);
+ // Allow analyze the whole table and dynamic partitions
+ HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
+ HiveConf.setVar(conf, HiveConf.ConfVars.HIVEMAPREDMODE, "nonstrict");
+
+ break;
+
+ case HiveParser.TOK_UNIONALL:
+ if (!qbp.getIsSubQ()) {
+ // this shouldn't happen. The parser should have converted the union to be
+ // contained in a subquery. Just in case, we keep the error as a fallback.
+ throw new SemanticException(generateErrorMessage(ast,
+ ErrorMsg.UNION_NOTIN_SUBQ.getMsg()));
+ }
+ skipRecursion = false;
+ break;
+
+ case HiveParser.TOK_INSERT:
+ ASTNode destination = (ASTNode) ast.getChild(0);
+ Tree tab = destination.getChild(0);
+
+ // Proceed if AST contains partition & If Not Exists
+ if (destination.getChildCount() == 2 &&
+ tab.getChildCount() == 2 &&
+ destination.getChild(1).getType() == HiveParser.TOK_IFNOTEXISTS) {
+ String tableName = tab.getChild(0).getChild(0).getText();
+
+ Tree partitions = tab.getChild(1);
+ int childCount = partitions.getChildCount();
+ HashMap<String, String> partition = new HashMap<String, String>();
+ for (int i = 0; i < childCount; i++) {
+ String partitionName = partitions.getChild(i).getChild(0).getText();
+ Tree pvalue = partitions.getChild(i).getChild(1);
+ if (pvalue == null) {
+ break;
+ }
+ String partitionVal = stripQuotes(pvalue.getText());
+ partition.put(partitionName, partitionVal);
+ }
+ // if it is a dynamic partition throw the exception
+ if (childCount != partition.size()) {
+ throw new SemanticException(ErrorMsg.INSERT_INTO_DYNAMICPARTITION_IFNOTEXISTS
+ .getMsg(partition.toString()));
+ }
+ Table table = null;
+ try {
+ table = this.getTableObjectByName(tableName);
+ } catch (HiveException ex) {
+ throw new SemanticException(ex);
+ }
+ try {
+ Partition parMetaData = db.getPartition(table, partition, false);
+ // Check whether the partition exists; if it does, skip the overwrite
+ if (parMetaData != null) {
+ phase1Result = false;
+ skipRecursion = true;
+ LOG.info("Partition already exists so insert into overwrite " +
+ "skipped for partition : " + parMetaData.toString());
+ break;
+ }
+ } catch (HiveException e) {
+ LOG.info("Error while getting metadata : ", e);
+ }
+ validatePartSpec(table, partition, (ASTNode)tab, conf, false);
+ }
+ skipRecursion = false;
+ break;
+ case HiveParser.TOK_LATERAL_VIEW:
+ case HiveParser.TOK_LATERAL_VIEW_OUTER:
+ // todo: nested LV
+ assert ast.getChildCount() == 1;
+ qb.getParseInfo().getDestToLateralView().put(ctx_1.dest, ast);
+ break;
+ case HiveParser.TOK_CTE:
+ processCTE(qb, ast);
+ break;
+ default:
+ skipRecursion = false;
+ break;
+ }
+ }
+
+ if (!skipRecursion) {
+ // Iterate over the rest of the children
+ int child_count = ast.getChildCount();
+ for (int child_pos = 0; child_pos < child_count && phase1Result; ++child_pos) {
+ // Recurse
+ phase1Result = phase1Result && doPhase1(
+ (ASTNode)ast.getChild(child_pos), qb, ctx_1, plannerCtx);
+ }
+ }
+ return phase1Result;
+ }
+
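The doPhase1 method above follows a single traversal contract: each token case either fully handles its subtree (setting skipRecursion) or lets the walk descend, and a false result, for example the insert-into-existing-partition case, stops the remaining children from being visited. Below is a minimal stand-alone sketch of that contract; the Node class and token names are illustrative only, not Hive types.

import java.util.ArrayList;
import java.util.List;

// Illustrative only: mirrors the doPhase1 walk pattern (handle token, optionally
// recurse, stop early once any child reports failure), without Hive classes.
class Phase1WalkSketch {
  static class Node {
    final String token;
    final List<Node> children = new ArrayList<>();
    Node(String token) { this.token = token; }
    Node add(Node c) { children.add(c); return this; }
  }

  static boolean walk(Node node) {
    boolean result = true;
    boolean skipRecursion;
    switch (node.token) {
      case "TOK_INSERT_PARTITION_EXISTS":   // analogous to the IF NOT EXISTS case above
        result = false;                     // tell the caller to stop planning this branch
        skipRecursion = true;
        break;
      case "TOK_GROUPBY":                   // handled fully here, children not revisited
        skipRecursion = true;
        break;
      default:
        skipRecursion = false;
    }
    if (!skipRecursion) {
      for (int i = 0; i < node.children.size() && result; i++) {
        result = result && walk(node.children.get(i));
      }
    }
    return result;
  }

  public static void main(String[] args) {
    Node root = new Node("TOK_QUERY")
        .add(new Node("TOK_GROUPBY"))
        .add(new Node("TOK_INSERT_PARTITION_EXISTS"));
    System.out.println(walk(root));   // false: the second child short-circuits the walk
  }
}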
+ /**
+ * This is phase1 of supporting specifying schema in insert statement
+ * insert into foo(z,y) select a,b from bar;
+ * @see #handleInsertStatementSpec(java.util.List, String, RowResolver, RowResolver, QB, ASTNode)
+ * @throws SemanticException
+ */
+ private void handleInsertStatementSpecPhase1(ASTNode ast, QBParseInfo qbp, Phase1Ctx ctx_1) throws SemanticException {
+ ASTNode tabColName = (ASTNode)ast.getChild(1);
+ if(ast.getType() == HiveParser.TOK_INSERT_INTO && tabColName != null && tabColName.getType() == HiveParser.TOK_TABCOLNAME) {
+ //we have "insert into foo(a,b)..."; parser will enforce that 1+ columns are listed if TOK_TABCOLNAME is present
+ List<String> targetColNames = new ArrayList<String>();
+ for(Node col : tabColName.getChildren()) {
+ assert ((ASTNode)col).getType() == HiveParser.Identifier :
+ "expected token " + HiveParser.Identifier + " found " + ((ASTNode)col).getType();
+ targetColNames.add(((ASTNode)col).getText());
+ }
+ String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0),
+ SessionState.get().getCurrentDatabase());
+ qbp.setDestSchemaForClause(ctx_1.dest, targetColNames);
+ Set<String> targetColumns = new HashSet<String>();
+ targetColumns.addAll(targetColNames);
+ if(targetColNames.size() != targetColumns.size()) {
+ throw new SemanticException(generateErrorMessage(tabColName,
+ "Duplicate column name detected in " + fullTableName + " table schema specification"));
+ }
+ Table targetTable = null;
+ try {
+ targetTable = db.getTable(fullTableName, false);
+ }
+ catch (HiveException ex) {
+ LOG.error("Error processing HiveParser.TOK_DESTINATION: " + ex.getMessage(), ex);
+ throw new SemanticException(ex);
+ }
+ if(targetTable == null) {
+ throw new SemanticException(generateErrorMessage(ast,
+ "Unable to access metadata for table " + fullTableName));
+ }
+ for(FieldSchema f : targetTable.getCols()) {
+ //parser only allows foo(a,b), not foo(foo.a, foo.b)
+ targetColumns.remove(f.getName());
+ }
+ if(!targetColumns.isEmpty()) {//here we need to see if remaining columns are dynamic partition columns
+ /* We just checked the user-specified schema columns against the regular table columns and found some which are not
+ 'regular'. Now check if they are dynamic partition columns.
+ For dynamic partitioning,
+ Given "create table multipart(a int, b int) partitioned by (c int, d int);"
+ for "insert into multipart partition(c='1',d)(d,a) values(2,3);" we expect parse tree to look like this
+ (TOK_INSERT_INTO
+ (TOK_TAB
+ (TOK_TABNAME multipart)
+ (TOK_PARTSPEC
+ (TOK_PARTVAL c '1')
+ (TOK_PARTVAL d)
+ )
+ )
+ (TOK_TABCOLNAME d a)
+ )*/
+ List<String> dynamicPartitionColumns = new ArrayList<String>();
+ if(ast.getChild(0) != null && ast.getChild(0).getType() == HiveParser.TOK_TAB) {
+ ASTNode tokTab = (ASTNode)ast.getChild(0);
+ ASTNode tokPartSpec = (ASTNode)tokTab.getFirstChildWithType(HiveParser.TOK_PARTSPEC);
+ if(tokPartSpec != null) {
+ for(Node n : tokPartSpec.getChildren()) {
+ ASTNode tokPartVal = null;
+ if(n instanceof ASTNode) {
+ tokPartVal = (ASTNode)n;
+ }
+ if(tokPartVal != null && tokPartVal.getType() == HiveParser.TOK_PARTVAL && tokPartVal.getChildCount() == 1) {
+ assert tokPartVal.getChild(0).getType() == HiveParser.Identifier :
+ "Expected column name; found tokType=" + tokPartVal.getType();
+ dynamicPartitionColumns.add(tokPartVal.getChild(0).getText());
+ }
+ }
+ }
+ }
+ for(String colName : dynamicPartitionColumns) {
+ targetColumns.remove(colName);
+ }
+ if(!targetColumns.isEmpty()) {
+ //Found some columns in user specified schema which are neither regular nor dynamic partition columns
+ throw new SemanticException(generateErrorMessage(tabColName,
+ "'" + (targetColumns.size() == 1 ? targetColumns.iterator().next() : targetColumns) +
+ "' in insert schema specification " + (targetColumns.size() == 1 ? "is" : "are") +
+ " not found among regular columns of " +
+ fullTableName + " nor dynamic partition columns."));
+ }
+ }
+ }
+ }
+
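The column-list validation above, including the dynamic partition case described in the inline comment, boils down to set subtraction: remove the table's regular columns from the user-specified list, then remove any dynamic partition columns, and whatever survives is an error. A self-contained sketch of that check using plain collections instead of Hive's Table/ASTNode types; the class and method names below are illustrative only.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Illustrative only: mirrors the validation in handleInsertStatementSpecPhase1
// without Hive metadata objects.
class InsertSchemaCheckSketch {
  static void check(List<String> targetCols, Set<String> regularCols, Set<String> dynPartCols) {
    Set<String> remaining = new HashSet<>(targetCols);
    if (remaining.size() != targetCols.size()) {
      throw new IllegalArgumentException("Duplicate column name in insert schema: " + targetCols);
    }
    remaining.removeAll(regularCols);    // drop columns that exist in the table
    remaining.removeAll(dynPartCols);    // drop dynamic partition columns, e.g. partition(c='1', d)
    if (!remaining.isEmpty()) {
      throw new IllegalArgumentException(remaining + " not found among regular or dynamic partition columns");
    }
  }

  public static void main(String[] args) {
    // create table multipart(a int, b int) partitioned by (c int, d int);
    // insert into multipart partition(c='1', d)(d, a) values(2, 3);
    check(Arrays.asList("d", "a"),
          new HashSet<>(Arrays.asList("a", "b")),
          new HashSet<>(Arrays.asList("d")));   // passes: d is a dynamic partition column, a is regular
  }
}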
+ public void getMaterializationMetadata(QB qb) throws SemanticException {
+ try {
+ gatherCTEReferences(qb, rootClause);
+ int threshold = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_CTE_MATERIALIZE_THRESHOLD);
+ for (CTEClause cte : Sets.newHashSet(aliasToCTEs.values())) {
+ if (threshold >= 0 && cte.reference >= threshold) {
+ cte.materialize = true;
+ }
+ }
+ } catch (HiveException e) {
+ // Has to use full name to make sure it does not conflict with
+ // org.apache.commons.lang.StringUtils
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ if (e instanceof SemanticException) {
+ throw (SemanticException)e;
+ }
+ throw new SemanticException(e.getMessage(), e);
+ }
+ }
+
+ private void gatherCTEReferences(QBExpr qbexpr, CTEClause parent) throws HiveException {
+ if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+ gatherCTEReferences(qbexpr.getQB(), parent);
+ } else {
+ gatherCTEReferences(qbexpr.getQBExpr1(), parent);
+ gatherCTEReferences(qbexpr.getQBExpr2(), parent);
+ }
+ }
+
+ // TODO: check view references, too
+ private void gatherCTEReferences(QB qb, CTEClause current) throws HiveException {
+ for (String alias : qb.getTabAliases()) {
+ String tabName = qb.getTabNameForAlias(alias);
+ String cteName = tabName.toLowerCase();
+
+ CTEClause cte = findCTEFromName(qb, cteName);
+ if (cte != null) {
+ if (ctesExpanded.contains(cteName)) {
+ throw new SemanticException("Recursive cte " + cteName +
+ " detected (cycle: " + StringUtils.join(ctesExpanded, " -> ") +
+ " -> " + cteName + ").");
+ }
+ cte.reference++;
+ current.parents.add(cte);
+ if (cte.qbExpr != null) {
+ continue;
+ }
+ cte.qbExpr = new QBExpr(cteName);
+ doPhase1QBExpr(cte.cteNode, cte.qbExpr, qb.getId(), cteName);
+
+ ctesExpanded.add(cteName);
+ gatherCTEReferences(cte.qbExpr, cte);
+ ctesExpanded.remove(ctesExpanded.size() - 1);
+ }
+ }
+ for (String alias : qb.getSubqAliases()) {
+ gatherCTEReferences(qb.getSubqForAlias(alias), current);
+ }
+ }
+
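gatherCTEReferences above guards against recursive CTEs by keeping the chain of names currently being expanded (ctesExpanded) and failing if the same name reappears before its expansion finishes. A minimal sketch of that push/recurse/pop guard over a plain name-to-references map; the names are illustrative, not Hive classes, and Java 9+ collection factories are used for brevity.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import java.util.Map;

// Illustrative only: the push/recurse/pop pattern used around doPhase1QBExpr for CTEs.
class CteCycleSketch {
  static void expand(String cte, Map<String, List<String>> referencedCtes, Deque<String> expanding) {
    if (expanding.contains(cte)) {
      throw new IllegalStateException("Recursive cte " + cte + " detected (cycle: "
          + String.join(" -> ", expanding) + " -> " + cte + ")");
    }
    expanding.addLast(cte);                          // mark as currently being expanded
    for (String referenced : referencedCtes.getOrDefault(cte, List.of())) {
      expand(referenced, referencedCtes, expanding);
    }
    expanding.removeLast();                          // expansion finished, pop the name
  }

  public static void main(String[] args) {
    // a references b, b references a again -> cycle
    Map<String, List<String>> refs = Map.of("a", List.of("b"), "b", List.of("a"));
    expand("a", refs, new ArrayDeque<>());           // throws: Recursive cte a detected (cycle: a -> b -> a)
  }
}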
+ public void getMetaData(QB qb) throws SemanticException {
+ getMetaData(qb, false);
+ }
+
+ public void getMetaData(QB qb, boolean enableMaterialization) throws SemanticException {
+ try {
+ if (enableMaterialization) {
+ getMaterializationMetadata(qb);
+ }
+ getMetaData(qb, null);
+ } catch (HiveException e) {
+ // Has to use full name to make sure it does not conflict with
+ // org.apache.commons.lang.StringUtils
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ if (e instanceof SemanticException) {
+ throw (SemanticException)e;
+ }
+ throw new SemanticException(e.getMessage(), e);
+ }
+ }
+
+ private void getMetaData(QBExpr qbexpr, ReadEntity parentInput)
+ throws HiveException {
+ if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+ getMetaData(qbexpr.getQB(), parentInput);
+ } else {
+ getMetaData(qbexpr.getQBExpr1(), parentInput);
+ getMetaData(qbexpr.getQBExpr2(), parentInput);
+ }
+ }
+
+ @SuppressWarnings("nls")
+ private void getMetaData(QB qb, ReadEntity parentInput)
+ throws HiveException {
+ LOG.info("Get metadata for source tables");
+
+ // Go over the tables and populate the related structures.
+ // We have to materialize the table alias list since we might
+ // modify it in the middle for view rewrite.
+ List<String> tabAliases = new ArrayList<String>(qb.getTabAliases());
+
+ // Keep track of view alias to view name and read entity
+ // For eg: for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T
+ // keeps track of full view name and read entity corresponding to alias V3, V3:V2, V3:V2:V1.
+ // This is needed for tracking the dependencies for inputs, along with their parents.
+ Map<String, ObjectPair<String, ReadEntity>> aliasToViewInfo =
+ new HashMap<String, ObjectPair<String, ReadEntity>>();
+
+ /*
+ * used to capture view to SQ conversions. This is used to check for
+ * recursive CTE invocations.
+ */
+ Map<String, String> sqAliasToCTEName = new HashMap<String, String>();
+
+ for (String alias : tabAliases) {
+ String tabName = qb.getTabNameForAlias(alias);
+ String cteName = tabName.toLowerCase();
+
+ Table tab = db.getTable(tabName, false);
+ if (tab == null ||
+ tab.getDbName().equals(SessionState.get().getCurrentDatabase())) {
+ Table materializedTab = ctx.getMaterializedTable(cteName);
+ if (materializedTab == null) {
+ // we first look for this alias from CTE, and then from catalog.
+ CTEClause cte = findCTEFromName(qb, cteName);
+ if (cte != null) {
+ if (!cte.materialize) {
+ addCTEAsSubQuery(qb, cteName, alias);
+ sqAliasToCTEName.put(alias, cteName);
+ continue;
+ }
+ tab = materializeCTE(cteName, cte);
+ }
+ } else {
+ tab = materializedTab;
+ }
+ }
+
+ if (tab == null) {
+ ASTNode src = qb.getParseInfo().getSrcForAlias(alias);
+ if (null != src) {
+ throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(src));
+ } else {
+ throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(alias));
+ }
+ }
+ if (tab.isView()) {
+ if (qb.getParseInfo().isAnalyzeCommand()) {
+ throw new SemanticException(ErrorMsg.ANALYZE_VIEW.getMsg());
+ }
+ String fullViewName = tab.getDbName() + "." + tab.getTableName();
+ // Prevent view cycles
+ if (viewsExpanded.contains(fullViewName)) {
+ throw new SemanticException("Recursive view " + fullViewName +
+ " detected (cycle: " + StringUtils.join(viewsExpanded, " -> ") +
+ " -> " + fullViewName + ").");
+ }
+ replaceViewReferenceWithDefinition(qb, tab, tabName, alias);
+ // This is the last time we'll see the Table objects for views, so add it to the inputs
+ // now. isInsideView will tell if this view is embedded in another view.
+ // If the view is Inside another view, it should have at least one parent
+ if (qb.isInsideView() && parentInput == null) {
+ parentInput = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
+ }
+ ReadEntity viewInput = new ReadEntity(tab, parentInput, !qb.isInsideView());
+ viewInput = PlanUtils.addInput(inputs, viewInput);
+ aliasToViewInfo.put(alias, new ObjectPair<String, ReadEntity>(fullViewName, viewInput));
+ String aliasId = getAliasId(alias, qb);
+ if (aliasId != null) {
+ aliasId = aliasId.replace(SemanticAnalyzer.SUBQUERY_TAG_1, "")
+ .replace(SemanticAnalyzer.SUBQUERY_TAG_2, "");
+ }
+ viewAliasToInput.put(aliasId, viewInput);
+ continue;
+ }
+
+ if (!InputFormat.class.isAssignableFrom(tab.getInputFormatClass())) {
+ throw new SemanticException(generateErrorMessage(
+ qb.getParseInfo().getSrcForAlias(alias),
+ ErrorMsg.INVALID_INPUT_FORMAT_TYPE.getMsg()));
+ }
+
+ qb.getMetaData().setSrcForAlias(alias, tab);
+
+ if (qb.getParseInfo().isAnalyzeCommand()) {
+ // allow partial partition specification for noscan since noscan is fast.
+ TableSpec ts = new TableSpec(db, conf, (ASTNode) ast.getChild(0), true, this.noscan);
+ if (ts.specType == SpecType.DYNAMIC_PARTITION) { // dynamic partitions
+ try {
+ ts.partitions = db.getPartitionsByNames(ts.tableHandle, ts.partSpec);
+ } catch (HiveException e) {
+ throw new SemanticException(generateErrorMessage(
+ qb.getParseInfo().getSrcForAlias(alias),
+ "Cannot get partitions for " + ts.partSpec), e);
+ }
+ }
+ // validate partial scan command
+ QBParseInfo qbpi = qb.getParseInfo();
+ if (qbpi.isPartialScanAnalyzeCommand()) {
+ Class<? extends InputFormat> inputFormatClass = null;
+ switch (ts.specType) {
+ case TABLE_ONLY:
+ case DYNAMIC_PARTITION:
+ inputFormatClass = ts.tableHandle.getInputFormatClass();
+ break;
+ case STATIC_PARTITION:
+ inputFormatClass = ts.partHandle.getInputFormatClass();
+ break;
+ default:
+ assert false;
+ }
+ // throw a HiveException for formats other than rcfile or orcfile.
+ if (!(inputFormatClass.equals(RCFileInputFormat.class) || inputFormatClass
+ .equals(OrcInputFormat.class))) {
+ throw new SemanticException(ErrorMsg.ANALYZE_TABLE_PARTIALSCAN_NON_RCFILE.getMsg());
+ }
+ }
+
+ tab.setTableSpec(ts);
+ qb.getParseInfo().addTableSpec(alias, ts);
+ }
+
+ ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
+ // Temporary tables created during the execution are not the input sources
+ if (!PlanUtils.isValuesTempTable(alias)) {
+ PlanUtils.addInput(inputs,
+ new ReadEntity(tab, parentViewInfo, parentViewInfo == null),mergeIsDirect);
+ }
+ }
+
+ LOG.info("Get metadata for subqueries");
+ // Go over the subqueries and getMetaData for these
+ for (String alias : qb.getSubqAliases()) {
+ boolean wasView = aliasToViewInfo.containsKey(alias);
+ boolean wasCTE = sqAliasToCTEName.containsKey(alias);
+ ReadEntity newParentInput = null;
+ if (wasView) {
+ viewsExpanded.add(aliasToViewInfo.get(alias).getFirst());
+ newParentInput = aliasToViewInfo.get(alias).getSecond();
+ } else if (wasCTE) {
+ ctesExpanded.add(sqAliasToCTEName.get(alias));
+ }
<TRUNCATED>
[3/5] hive git commit: HIVE-16423: Add hint to enforce semi join
optimization (Deepak Jaiswal, reviewed by Jason Dere)
Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index d58f447..83e89af 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -266,11 +266,14 @@ public class GenTezUtils {
}
}
// This TableScanOperator could be part of semijoin optimization.
- Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap =
- context.parseContext.getRsOpToTsOpMap();
- for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) {
- if (rsOpToTsOpMap.get(rs) == orig) {
- rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot);
+ Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
+ context.parseContext.getRsToSemiJoinBranchInfo();
+ for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
+ SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
+ if (sjInfo.getTsOp() == orig) {
+ SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo(
+ (TableScanOperator)newRoot, sjInfo.getIsHint());
+ rsToSemiJoinBranchInfo.put(rs, newSJInfo);
}
}
}
@@ -516,19 +519,18 @@ public class GenTezUtils {
return EdgeType.SIMPLE_EDGE;
}
- public static void processDynamicMinMaxPushDownOperator(
+ public static void processDynamicSemiJoinPushDownOperator(
GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo,
ReduceSinkOperator rs)
throws SemanticException {
- TableScanOperator ts = procCtx.parseContext.getRsOpToTsOpMap().get(rs);
+ SemiJoinBranchInfo sjInfo = procCtx.parseContext.getRsToSemiJoinBranchInfo().get(rs);
List<BaseWork> rsWorkList = procCtx.childToWorkMap.get(rs);
- if (ts == null || rsWorkList == null) {
+ if (sjInfo == null || rsWorkList == null) {
// This happens when the ReduceSink's edge has been removed by cycle
// detection logic. Nothing to do here.
return;
}
- LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
if (rsWorkList.size() != 1) {
StringBuilder sb = new StringBuilder();
@@ -541,6 +543,9 @@ public class GenTezUtils {
throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString());
}
+ TableScanOperator ts = sjInfo.getTsOp();
+ LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
+
BaseWork parentWork = rsWorkList.get(0);
BaseWork childWork = procCtx.rootToWorkMap.get(ts);
@@ -611,7 +616,7 @@ public class GenTezUtils {
skip = true;
}
}
- context.getRsOpToTsOpMap().remove(rs);
+ context.getRsToSemiJoinBranchInfo().remove(rs);
}
private static class DynamicValuePredicateContext implements NodeProcessorCtx {
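The first GenTezUtils hunk above rewrites rsToSemiJoinBranchInfo when a TableScanOperator root is cloned: every entry whose branch info points at the original operator gets a fresh SemiJoinBranchInfo targeting the clone, preserving the isHint flag. A small generic sketch of that rebuild follows; Info and replaceTarget are illustrative stand-ins, not Hive classes. Replacing the value of an existing key while iterating keySet() is not a structural modification, so the loop is safe.

import java.util.HashMap;
import java.util.Map;

// Illustrative only: the value-rebuild pattern used for rsToSemiJoinBranchInfo
// when a TableScanOperator is replaced by a clone.
class BranchInfoRemapSketch {
  static final class Info {
    final Object target;
    final boolean isHint;
    Info(Object target, boolean isHint) { this.target = target; this.isHint = isHint; }
  }

  static void replaceTarget(Map<String, Info> map, Object orig, Object replacement) {
    for (String key : map.keySet()) {
      Info info = map.get(key);
      if (info.target == orig) {
        // Rebuild the value rather than mutating it, keeping the hint flag intact.
        map.put(key, new Info(replacement, info.isHint));
      }
    }
  }

  public static void main(String[] args) {
    Object oldTs = new Object(), newTs = new Object();
    Map<String, Info> map = new HashMap<>();
    map.put("RS_1", new Info(oldTs, true));
    map.put("RS_2", new Info(new Object(), false));
    replaceTarget(map, oldTs, newTs);
    System.out.println(map.get("RS_1").target == newTs);  // true
  }
}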
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
index 8e70a46..e110fb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
@@ -31,6 +31,7 @@ tokens {
TOK_MAPJOIN;
TOK_STREAMTABLE;
TOK_HINTARGLIST;
+ TOK_LEFTSEMIJOIN;
}
@header {
@@ -69,6 +70,7 @@ hintItem
hintName
:
KW_MAPJOIN -> TOK_MAPJOIN
+ | KW_SEMI -> TOK_LEFTSEMIJOIN
| KW_STREAMTABLE -> TOK_STREAMTABLE
;
@@ -80,4 +82,5 @@ hintArgs
hintArgName
:
Identifier
+ | Number
;
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
index 3f9f76c..9a69f90 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
@@ -33,17 +33,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.QueryProperties;
import org.apache.hadoop.hive.ql.QueryState;
-import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.FetchTask;
-import org.apache.hadoop.hive.ql.exec.JoinOperator;
-import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
-import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.Operator;
-import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
-import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.SelectOperator;
-import org.apache.hadoop.hive.ql.exec.TableScanOperator;
-import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.*;
import org.apache.hadoop.hive.ql.hooks.LineageInfo;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.metadata.Table;
@@ -126,11 +116,10 @@ public class ParseContext {
private boolean needViewColumnAuthorization;
private Set<FileSinkDesc> acidFileSinks = Collections.emptySet();
- // Map to store mapping between reduce sink Operator and TS Operator for semijoin
- private Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap =
- new HashMap<ReduceSinkOperator, TableScanOperator>();
private Map<ReduceSinkOperator, RuntimeValuesInfo> rsToRuntimeValuesInfo =
new HashMap<ReduceSinkOperator, RuntimeValuesInfo>();
+ private Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
+ new HashMap<>();
public ParseContext() {
}
@@ -666,11 +655,11 @@ public class ParseContext {
return rsToRuntimeValuesInfo;
}
- public void setRsOpToTsOpMap(Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap) {
- this.rsOpToTsOpMap = rsOpToTsOpMap;
+ public void setRsToSemiJoinBranchInfo(Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo) {
+ this.rsToSemiJoinBranchInfo = rsToSemiJoinBranchInfo;
}
- public Map<ReduceSinkOperator, TableScanOperator> getRsOpToTsOpMap() {
- return rsOpToTsOpMap;
+ public Map<ReduceSinkOperator, SemiJoinBranchInfo> getRsToSemiJoinBranchInfo() {
+ return rsToSemiJoinBranchInfo;
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
index ec76fb7..bcef252 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.hive.ql.parse;
+import java.util.Arrays;
+
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
@@ -82,6 +84,7 @@ public class QBJoinTree implements Serializable, Cloneable {
* We then add a Filter Operator after the Join Operator for this QBJoinTree.
*/
private final List<ASTNode> postJoinFilters;
+ private Map<String, SemiJoinHint> semiJoinHint;
/**
* constructor.
@@ -429,4 +432,17 @@ public class QBJoinTree implements Serializable, Cloneable {
return cloned;
}
+
+ public void setSemiJoinHint(Map<String, SemiJoinHint> semiJoinHint) {
+ this.semiJoinHint = semiJoinHint;
+ }
+
+ public Map<String, SemiJoinHint> getSemiJoinHint() {
+ return semiJoinHint;
+ }
+
+ @Override
+ public String toString() {
+ return "QBJoinTree [leftAlias=" + leftAlias + ", rightAliases=" + Arrays.toString(rightAliases) + ", leftAliases=" + Arrays.toString(leftAliases) + ", semiJoinHint=" + semiJoinHint + "]";
+ }
}
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index b5a5645..e4ca25b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -8122,6 +8122,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
JoinDesc desc = new JoinDesc(exprMap, outputColumnNames,
join.getNoOuterJoin(), joinCondns, filterMap, joinKeys);
+ desc.setSemiJoinHints(join.getSemiJoinHint());
desc.setReversedExprs(reversedExprs);
desc.setFilterMap(join.getFilterMap());
// For outer joins, add filters that apply to more than one input
@@ -8669,6 +8670,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
parseStreamTables(joinTree, qb);
}
+ if (qb.getParseInfo().getHints() != null) {
+ // TODO: do we need this for unique join?
+ joinTree.setSemiJoinHint(parseSemiJoinHint(qb.getParseInfo().getHints()));
+ }
return joinTree;
}
@@ -8967,6 +8972,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
if ((conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) == false) {
parseStreamTables(joinTree, qb);
}
+
+ joinTree.setSemiJoinHint(parseSemiJoinHint(qb.getParseInfo().getHints()));
}
return joinTree;
@@ -9014,6 +9021,62 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
joinTree.setStreamAliases(streamAliases);
}
+ /** Parses semijoin hints in the query and returns a map from table alias to SemiJoinHint
+ * (the bloom filter entry count is left unspecified when the hint does not provide one).
+ * Hints can be in 3 formats:
+ * 1. TableName, ColumnName, bloom filter entries
+ * 2. TableName, bloom filter entries
+ * 3. TableName, ColumnName
+ */
+ public Map<String, SemiJoinHint> parseSemiJoinHint(ASTNode hints) throws SemanticException {
+ if (hints == null) return null;
+ Map<String, SemiJoinHint> result = null;
+ for (Node hintNode : hints.getChildren()) {
+ ASTNode hint = (ASTNode) hintNode;
+ if (hint.getChild(0).getType() != HintParser.TOK_LEFTSEMIJOIN) continue;
+ if (result == null) {
+ result = new HashMap<>();
+ }
+ String alias = null;
+ String colName = null;
+ Tree args = hint.getChild(1);
+ for (int i = 0; i < args.getChildCount(); i++) {
+ // We can have table names, column names or sizes here (or incorrect hint if the user is so inclined).
+ String text = args.getChild(i).getText();
+ Integer number = null;
+ try {
+ number = Integer.parseInt(text);
+ } catch (NumberFormatException ex) { // Ignore.
+ }
+ if (number != null) {
+ if (alias == null) {
+ throw new SemanticException("Invalid semijoin hint - arg " + i + " ("
+ + text + ") is a number but the previous one is not an alias");
+ }
+ SemiJoinHint sjHint = new SemiJoinHint(alias, colName, number);
+ result.put(alias, sjHint);
+ alias = null;
+ colName = null;
+ } else {
+ if (alias == null) {
+ alias = text;
+ } else if (colName == null ){
+ colName = text;
+ } else {
+ // No bloom filter entries provided.
+ SemiJoinHint sjHint = new SemiJoinHint(alias, colName, null);
+ result.put(alias, sjHint);
+ alias = text;
+ colName = null;
+ }
+ }
+ }
+ }
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Semijoin hint parsed: " + result);
+ }
+ return result;
+ }
+
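parseSemiJoinHint above walks the hint's argument list as a small state machine: identifiers fill in the pending (alias, column) pair, a numeric token closes the pair with a bloom filter entry count, and a third identifier in a row closes the previous pair without a count. Below is a self-contained sketch of that walk over plain strings; Hint is an illustrative stand-in for SemiJoinHint, the sample argument list is only an example of what the grammar accepts, and, as in the patch, a trailing alias/column pair not followed by another token is dropped.

import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative only: mirrors the argument interpretation in parseSemiJoinHint
// without ANTLR trees, e.g. for the args of a hint written as semi(srcpart, ds, 5000, src, 1000).
class SemiJoinHintArgSketch {
  static final class Hint {
    final String table; final String column; final Integer entries;
    Hint(String table, String column, Integer entries) {
      this.table = table; this.column = column; this.entries = entries;
    }
    @Override public String toString() { return table + "(" + column + ", " + entries + ")"; }
  }

  static Map<String, Hint> parse(String... args) {
    Map<String, Hint> result = new LinkedHashMap<>();
    String alias = null, column = null;
    for (String text : args) {
      Integer number = null;
      try { number = Integer.parseInt(text); } catch (NumberFormatException ignore) { }
      if (number != null) {
        if (alias == null) {
          throw new IllegalArgumentException("number " + text + " without a preceding alias");
        }
        result.put(alias, new Hint(alias, column, number));   // formats 1 and 2
        alias = null; column = null;
      } else if (alias == null) {
        alias = text;
      } else if (column == null) {
        column = text;
      } else {
        result.put(alias, new Hint(alias, column, null));     // format 3, count unspecified
        alias = text; column = null;
      }
    }
    return result;   // a pending pair left at the end is dropped, as in the patch
  }

  public static void main(String[] args) {
    System.out.println(parse("srcpart", "ds", "5000", "src", "1000"));
    // {srcpart=srcpart(ds, 5000), src=src(null, 1000)}
  }
}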
/**
* Merges node to target
*/