Posted to commits@hive.apache.org by gu...@apache.org on 2017/04/20 17:18:52 UTC

[1/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Repository: hive
Updated Branches:
  refs/heads/master fa24d4b9b -> 9d5d737db


http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
new file mode 100644
index 0000000..5d7b9e5
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+
+public class SemiJoinBranchInfo {
+  private TableScanOperator ts;
+  private boolean isHint;
+
+  public SemiJoinBranchInfo(TableScanOperator ts) {
+    this.ts = ts;
+    isHint = false;
+  }
+
+  public SemiJoinBranchInfo(TableScanOperator ts, boolean isHint) {
+    this.ts = ts;
+    this.isHint = isHint;
+  }
+
+  public TableScanOperator getTsOp() {
+    return ts;
+  }
+
+  public boolean getIsHint() {
+    return isHint;
+  }
+}
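
For readers following the patch: a minimal sketch of how this new holder is meant to be used, assuming a TableScanOperator taken from an existing plan. The one-argument constructor marks a branch the optimizer derived from statistics; the two-argument form flags a branch forced by a user hint, which most of the later cleanup passes in TezCompiler then skip.

    import org.apache.hadoop.hive.ql.exec.TableScanOperator;
    import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;

    class SemiJoinBranchInfoSketch {
      // Branch chosen by the optimizer from column statistics.
      static SemiJoinBranchInfo fromStats(TableScanOperator ts) {
        return new SemiJoinBranchInfo(ts);        // isHint defaults to false
      }
      // Branch forced by a /*+ semi(...) */ hint; getIsHint() returns true.
      static SemiJoinBranchInfo fromHint(TableScanOperator ts) {
        return new SemiJoinBranchInfo(ts, true);
      }
    }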

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
new file mode 100644
index 0000000..1f24e23
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+public class SemiJoinHint {
+  private String tabAlias;
+  private String colName;
+  private Integer numEntries;
+
+  public SemiJoinHint(String tabAlias, String colName, Integer numEntries) {
+    this.tabAlias = tabAlias;
+    this.colName = colName;
+    this.numEntries = numEntries;
+  }
+
+  public String getTabAlias() {
+    return tabAlias;
+  }
+
+  public String getColName() {
+    return colName;
+  }
+
+  public Integer getNumEntries() {
+    return numEntries != null ? numEntries : -1;
+  }
+}
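
A small illustration of how the hint arguments likely map onto this class, going by the queries in semijoin_hint.q further down (the actual parsing is handled elsewhere in the patch): /*+ semi(k, str, 5000) */ supplies a table alias, a column name, and an expected entry count, and a missing count falls back to -1 so callers can tell "unspecified" apart from a real size.

    import org.apache.hadoop.hive.ql.parse.SemiJoinHint;

    class SemiJoinHintSketch {
      public static void main(String[] args) {
        SemiJoinHint full   = new SemiJoinHint("k", "str", 5000);
        SemiJoinHint noSize = new SemiJoinHint("k", "str", null);
        System.out.println(full.getNumEntries());    // 5000
        System.out.println(noSize.getNumEntries());  // -1, i.e. use the stats-based estimate
      }
    }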

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
index 7caeb78..96525b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
@@ -531,7 +531,7 @@ public abstract class TaskCompiler {
     clone.setLineageInfo(pCtx.getLineageInfo());
     clone.setMapJoinOps(pCtx.getMapJoinOps());
     clone.setRsToRuntimeValuesInfoMap(pCtx.getRsToRuntimeValuesInfoMap());
-    clone.setRsOpToTsOpMap(pCtx.getRsOpToTsOpMap());
+    clone.setRsToSemiJoinBranchInfo(pCtx.getRsToSemiJoinBranchInfo());
 
     return clone;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index eaad988..26eda04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -178,6 +178,9 @@ public class TezCompiler extends TaskCompiler {
     TableScanOperator victimTS = null;
     ReduceSinkOperator victimRS = null;
 
+    // If there is a hint and no operator is removed then throw error
+    boolean hasHint = false;
+    boolean removed = false;
     for (Operator<?> o : component) {
       // Look for AppMasterEventOperator or ReduceSinkOperator
       if (o instanceof AppMasterEventOperator) {
@@ -185,25 +188,34 @@ public class TezCompiler extends TaskCompiler {
                 || o.getStatistics().getDataSize() < victimAM.getStatistics()
                 .getDataSize()) {
           victimAM = (AppMasterEventOperator) o;
+          removed = true;
         }
       } else if (o instanceof ReduceSinkOperator) {
-        TableScanOperator ts = context.parseContext.getRsOpToTsOpMap().get(o);
-        if (ts == null) {
+
+        SemiJoinBranchInfo sjInfo =
+                context.parseContext.getRsToSemiJoinBranchInfo().get(o);
+        if (sjInfo == null ) continue;
+        if (sjInfo.getIsHint()) {
+          // Skipping because of hint. Mark this info.
+          hasHint = true;
           continue;
         }
+
+        TableScanOperator ts = sjInfo.getTsOp();
         // Sanity check
         assert component.contains(ts);
 
         if (victimRS == null ||
                 ts.getStatistics().getDataSize() <
-                victimTS.getStatistics().getDataSize()) {
-            victimRS = (ReduceSinkOperator) o;
-            victimTS = ts;
-          }
+                        victimTS.getStatistics().getDataSize()) {
+          victimRS = (ReduceSinkOperator) o;
+          victimTS = ts;
+          removed = true;
         }
       }
+    }
 
-    // Always set the min/max optimization as victim.
+    // Always set the semijoin optimization as victim.
     Operator<?> victim = victimRS;
 
     if (victimRS == null && victimAM != null ) {
@@ -227,6 +239,11 @@ public class TezCompiler extends TaskCompiler {
       }
     }
 
+    if (hasHint && !removed) {
+      // There is a hint but none of the operators was removed. Throw an error.
+      throw new SemanticException("The user hint is causing an operator cycle. Please fix it and retry");
+    }
+
     if (victim == null ||
             (!context.pruningOpsRemovedByPriorOpt.isEmpty() &&
                     context.pruningOpsRemovedByPriorOpt.contains(victim))) {
@@ -287,11 +304,12 @@ public class TezCompiler extends TaskCompiler {
       LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
       children.add(ts);
     } else if (o instanceof ReduceSinkOperator){
-      // min/max case
+      // semijoin case
       children = new ArrayList<Operator<?>>();
       children.addAll(o.getChildOperators());
-      TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(o);
-      if (ts != null) {
+      SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
+      if (sjInfo != null ) {
+        TableScanOperator ts = sjInfo.getTsOp();
         LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
         children.add(ts);
       }
@@ -460,7 +478,7 @@ public class TezCompiler extends TaskCompiler {
     if (pCtx.getRsToRuntimeValuesInfoMap().size() > 0) {
       for (ReduceSinkOperator rs : pCtx.getRsToRuntimeValuesInfoMap().keySet()) {
         // Process min/max
-        GenTezUtils.processDynamicMinMaxPushDownOperator(
+        GenTezUtils.processDynamicSemiJoinPushDownOperator(
                 procCtx, pCtx.getRsToRuntimeValuesInfoMap().get(rs), rs);
       }
     }
@@ -617,7 +635,7 @@ public class TezCompiler extends TaskCompiler {
   private static void removeSemijoinOptimizationFromSMBJoins(
           OptimizeTezProcContext procCtx) throws SemanticException {
     if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
-            procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+            procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
       return;
     }
 
@@ -636,9 +654,9 @@ public class TezCompiler extends TaskCompiler {
     GraphWalker ogw = new PreOrderOnceWalker(disp);
     ogw.startWalking(topNodes, null);
 
+    List<TableScanOperator> tsOps = new ArrayList<>();
     // Iterate over the map and remove semijoin optimizations if needed.
     for (CommonMergeJoinOperator joinOp : ctx.JoinOpToTsOpMap.keySet()) {
-      List<TableScanOperator> tsOps = new ArrayList<TableScanOperator>();
       // Get one top level TS Op directly from the stack
       tsOps.add(ctx.JoinOpToTsOpMap.get(joinOp));
 
@@ -651,7 +669,7 @@ public class TezCompiler extends TaskCompiler {
         }
 
         assert parent instanceof SelectOperator;
-        while(parent != null) {
+        while (parent != null) {
           if (parent instanceof TableScanOperator) {
             tsOps.add((TableScanOperator) parent);
             break;
@@ -659,20 +677,24 @@ public class TezCompiler extends TaskCompiler {
           parent = parent.getParentOperators().get(0);
         }
       }
-
-      // Now the relevant TableScanOperators are known, find if there exists
-      // a semijoin filter on any of them, if so, remove it.
-      ParseContext pctx = procCtx.parseContext;
-      for (TableScanOperator ts : tsOps) {
-        for (ReduceSinkOperator rs : pctx.getRsOpToTsOpMap().keySet()) {
-          if (ts == pctx.getRsOpToTsOpMap().get(rs)) {
-            // match!
-            if (LOG.isDebugEnabled()) {
-              LOG.debug("Semijoin optimization found going to SMB join. Removing semijoin "
-                  + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
-            }
-            GenTezUtils.removeBranch(rs);
-            GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+    }
+    // Now the relevant TableScanOperators are known, find if there exists
+    // a semijoin filter on any of them, if so, remove it.
+
+    ParseContext pctx = procCtx.parseContext;
+    for (TableScanOperator ts : tsOps) {
+      for (ReduceSinkOperator rs : pctx.getRsToSemiJoinBranchInfo().keySet()) {
+        SemiJoinBranchInfo sjInfo = pctx.getRsToSemiJoinBranchInfo().get(rs);
+        if (ts == sjInfo.getTsOp()) {
+          // match!
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Semijoin optimization found going to SMB join. Removing semijoin "
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+          }
+          GenTezUtils.removeBranch(rs);
+          GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+          if (sjInfo.getIsHint()) {
+            LOG.debug("Removing hinted semijoin as it is with SMB join " + rs + " : " + ts);
           }
         }
       }
@@ -699,7 +721,7 @@ public class TezCompiler extends TaskCompiler {
   private static void removeSemiJoinCyclesDueToMapsideJoins(
           OptimizeTezProcContext procCtx) throws SemanticException {
     if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
-            procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+            procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
       return;
     }
 
@@ -752,10 +774,10 @@ public class TezCompiler extends TaskCompiler {
         }
 
         ReduceSinkOperator rs = ((ReduceSinkOperator) child);
-        TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
-        if (ts == null) {
-          continue;
-        }
+        SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+        if (sjInfo == null) continue;
+
+        TableScanOperator ts = sjInfo.getTsOp();
         // This is a semijoin branch. Find if this is creating a potential
         // cycle with childJoin.
         for (Operator<?> parent : childJoin.getParentOperators()) {
@@ -776,6 +798,9 @@ public class TezCompiler extends TaskCompiler {
             }
             GenTezUtils.removeBranch(rs);
             GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
+            if (sjInfo.getIsHint()) {
+              LOG.debug("Removing hinted semijoin as it is creating cycles with mapside joins " + rs + " : " + ts);
+            }
           }
         }
       }
@@ -790,8 +815,8 @@ public class TezCompiler extends TaskCompiler {
       assert nd instanceof ReduceSinkOperator;
       ReduceSinkOperator rs = (ReduceSinkOperator) nd;
       ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext;
-      TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
-      if (ts == null) {
+      SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+      if (sjInfo == null) {
         // nothing to do here.
         return null;
       }
@@ -802,6 +827,7 @@ public class TezCompiler extends TaskCompiler {
       GroupByDesc gbDesc = gbOp.getConf();
       ArrayList<AggregationDesc> aggregationDescs = gbDesc.getAggregators();
       boolean removeSemiJoin = false;
+      TableScanOperator ts = sjInfo.getTsOp();
       for (AggregationDesc agg : aggregationDescs) {
         if (agg.getGenericUDAFName() != "bloom_filter") {
           continue;
@@ -809,20 +835,24 @@ public class TezCompiler extends TaskCompiler {
 
         GenericUDAFBloomFilterEvaluator udafBloomFilterEvaluator =
                 (GenericUDAFBloomFilterEvaluator) agg.getGenericUDAFEvaluator();
+        if (udafBloomFilterEvaluator.hasHintEntries())
+          return null; // Created using hint, skip it
+
         long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
         if (expectedEntries == -1 || expectedEntries >
                 pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
           removeSemiJoin = true;
           if (LOG.isDebugEnabled()) {
             LOG.debug("expectedEntries=" + expectedEntries + ". "
-                + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
-                + "Removing semijoin "
-                + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+                    + "Either stats unavailable or expectedEntries exceeded max allowable bloomfilter size. "
+                    + "Removing semijoin "
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
           }
           break;
         }
       }
 
+      // At this point, hinted semijoin case has been handled already
       // Check if big table is big enough that runtime filtering is
       // worth it.
       if (ts.getStatistics() != null) {
@@ -831,16 +861,16 @@ public class TezCompiler extends TaskCompiler {
           removeSemiJoin = true;
           if (LOG.isDebugEnabled()) {
             LOG.debug("Insufficient rows (" + numRows + ") to justify semijoin optimization. Removing semijoin "
-                + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts));
           }
         }
       }
-
       if (removeSemiJoin) {
         // The stats are not annotated, remove the semijoin operator
         GenTezUtils.removeBranch(rs);
         GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
       }
+
       return null;
     }
   }
@@ -905,15 +935,23 @@ public class TezCompiler extends TaskCompiler {
           }
 
           ReduceSinkOperator rs = (ReduceSinkOperator) child;
-          TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(rs);
-          if (ts == null || ts != bigTableTS) {
-            // skip, no semijoin or not the one we are looking for.
+          SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
+          if (sjInfo == null) continue;
+
+          TableScanOperator ts = sjInfo.getTsOp();
+          if (ts != bigTableTS) {
+            // skip, not the one we are looking for.
             continue;
           }
 
+          parallelEdges = true;
+
+          if (sjInfo.getIsHint()) {
+            // Created by hint, skip it
+            continue;
+          }
           // Add the semijoin branch to the map
           semijoins.put(rs, ts);
-          parallelEdges = true;
         }
       }
     }
@@ -1141,10 +1179,15 @@ public class TezCompiler extends TaskCompiler {
     }
 
     List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<ReduceSinkOperator>();
-    Map<ReduceSinkOperator, TableScanOperator> map = procCtx.parseContext.getRsOpToTsOpMap();
+    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
     double semijoinReductionThreshold = procCtx.conf.getFloatVar(
         HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
     for (ReduceSinkOperator rs : map.keySet()) {
+      SemiJoinBranchInfo sjInfo = map.get(rs);
+      if (sjInfo.getIsHint()) {
+        // Semijoin created using hint, skip it
+        continue;
+      }
       // rs is semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
       // Get to the SelectOperator ancestor
       SelectOperator sel = null;
@@ -1159,7 +1202,7 @@ public class TezCompiler extends TaskCompiler {
       }
 
       // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
-      TableScanOperator ts = map.get(rs);
+      TableScanOperator ts = sjInfo.getTsOp();
       RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
       ExprNodeDesc tsExpr = rti.getTsColExpr();
       // In the SEL operator of the semijoin branch, there should be only one column in the operator
@@ -1179,7 +1222,7 @@ public class TezCompiler extends TaskCompiler {
     }
 
     for (ReduceSinkOperator rs : semijoinRsToRemove) {
-      TableScanOperator ts = map.get(rs);
+      TableScanOperator ts = map.get(rs).getTsOp();
       if (LOG.isDebugEnabled()) {
         LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs)
             + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
index 18e4fbd..3143554 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
@@ -18,7 +18,10 @@
 
 package org.apache.hadoop.hive.ql.plan;
 
+import java.util.Map;
+
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 
 /**
@@ -29,14 +32,17 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
 
   Operator<? extends OperatorDesc> source;
   int keyIndex;
+  Map<String, SemiJoinHint> hints;
 
   public ExprNodeDynamicListDesc() {
   }
 
-  public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends OperatorDesc> source, int keyIndex) {
+  public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends OperatorDesc> source,
+      int keyIndex, Map<String, SemiJoinHint> hints) {
     super(typeInfo);
     this.source = source;
     this.keyIndex = keyIndex;
+    this.hints = hints;
   }
 
   public void setSource(Operator<? extends OperatorDesc> source) {
@@ -57,8 +63,7 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
 
   @Override
   public ExprNodeDesc clone() {
-    ExprNodeDynamicListDesc clone = new ExprNodeDynamicListDesc(typeInfo, source, keyIndex);
-    return clone;
+    return new ExprNodeDynamicListDesc(typeInfo, source, keyIndex, hints);
   }
 
   @Override
@@ -78,4 +83,8 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
   public String toString() {
     return source.toString();
   }
+
+  public Map<String, SemiJoinHint> getHints() {
+    return hints;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
index bcf3691..032c7bb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
@@ -29,6 +29,7 @@ import java.util.Map;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.apache.hadoop.hive.ql.plan.Explain.Level;
 
 
@@ -106,6 +107,10 @@ public class JoinDesc extends AbstractOperatorDesc {
   private transient Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo;
   private transient boolean leftInputJoin;
   private transient List<String> streamAliases;
+  // Note: there are two things in Hive called semi-joins - the left semi join construct,
+  //       and also a bloom-filter based optimization that came later. This is for the latter.
+  //       Everything else in this desc that says "semi-join" is for the former.
+  private transient Map<String, SemiJoinHint> semiJoinHints;
 
   public JoinDesc() {
   }
@@ -197,6 +202,7 @@ public class JoinDesc extends AbstractOperatorDesc {
     this.filterMap = clone.filterMap;
     this.residualFilterExprs = clone.residualFilterExprs;
     this.statistics = clone.statistics;
+    this.semiJoinHints = clone.semiJoinHints;
   }
 
   public Map<Byte, List<ExprNodeDesc>> getExprs() {
@@ -682,4 +688,16 @@ public class JoinDesc extends AbstractOperatorDesc {
     streamAliases = joinDesc.streamAliases == null ? null : new ArrayList<String>(joinDesc.streamAliases);
   }
 
+  private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(JoinDesc.class);
+  public void setSemiJoinHints(Map<String, SemiJoinHint> semiJoinHints) {
+    if (semiJoinHints != null || this.semiJoinHints != null) {
+      LOG.debug("Setting semi-join hints to " + semiJoinHints);
+    }
+    this.semiJoinHints = semiJoinHints;
+  }
+
+  public Map<String, SemiJoinHint> getSemiJoinHints() {
+    return semiJoinHints;
+  }
+
 }
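
A short sketch of the new accessor pair, assuming the map is keyed by table alias (which is how the q-file hints such as /*+ semi(k, str, 5000) */ read):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
    import org.apache.hadoop.hive.ql.plan.JoinDesc;

    class JoinDescHintSketch {
      static void attachHints(JoinDesc joinDesc) {
        Map<String, SemiJoinHint> hints = new HashMap<>();
        hints.put("k", new SemiJoinHint("k", "str", 5000));  // alias -> hint (assumed keying)
        joinDesc.setSemiJoinHints(hints);
        // SyntheticJoinPredicate reads them back via getSemiJoinHints() when it builds
        // the synthetic IN (DynamicValue...) predicates.
        Map<String, SemiJoinHint> readBack = joinDesc.getSemiJoinHints();
      }
    }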

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
index 71c7310..f45daa8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
@@ -26,6 +26,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.Stack;
 
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
@@ -134,13 +135,12 @@ public class SyntheticJoinPredicate extends Transform {
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         Object... nodeOutputs) throws SemanticException {
 
-      ParseContext pCtx = ((SyntheticContext) procCtx).getParseContext();
-
       @SuppressWarnings("unchecked")
       CommonJoinOperator<JoinDesc> join = (CommonJoinOperator<JoinDesc>) nd;
 
       ReduceSinkOperator source = (ReduceSinkOperator) stack.get(stack.size() - 2);
       int srcPos = join.getParentOperators().indexOf(source);
+      Map<String, SemiJoinHint> hints = join.getConf().getSemiJoinHints();
 
       List<Operator<? extends OperatorDesc>> parents = join.getParentOperators();
 
@@ -181,7 +181,7 @@ public class SyntheticJoinPredicate extends Transform {
           inArgs.add(sourceKeys.get(i));
 
           ExprNodeDynamicListDesc dynamicExpr =
-              new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), target, i);
+              new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), target, i, hints);
 
           inArgs.add(dynamicExpr);
 

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
index 2b84beb..2413ae6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
@@ -72,6 +72,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
   public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator {
     // Source operator to get the number of entries
     private SelectOperator sourceOperator;
+    private long hintEntries = -1;
     private long maxEntries = 0;
     private long minEntries = 0;
     private float factor = 1;
@@ -254,6 +255,10 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
     }
 
     public long getExpectedEntries() {
+      // If hint is provided use that size.
+      if (hintEntries > 0 )
+        return hintEntries;
+
       long expectedEntries = -1;
       if (sourceOperator != null && sourceOperator.getStatistics() != null) {
         Statistics stats = sourceOperator.getStatistics();
@@ -294,6 +299,14 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
       this.sourceOperator = sourceOperator;
     }
 
+    public void setHintEntries(long hintEntries) {
+      this.hintEntries = hintEntries;
+    }
+
+    public boolean hasHintEntries() {
+      return hintEntries != -1;
+    }
+
     public void setMaxEntries(long maxEntries) {
       this.maxEntries = maxEntries;
     }
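
Behavior of the new hint plumbing in the evaluator, sketched under the assumption that the evaluator can be constructed directly (in the planner it is obtained from the aggregation desc): once a positive hint size is set it wins over the stats-derived estimate, and hasHintEntries() lets TezCompiler leave such branches alone.

    import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator;

    class BloomFilterHintSketch {
      static void demo() {
        GenericUDAFBloomFilterEvaluator eval = new GenericUDAFBloomFilterEvaluator();
        eval.setHintEntries(5000);                 // carried over from /*+ semi(..., 5000) */
        boolean hinted = eval.hasHintEntries();    // true: hintEntries is no longer -1
        long expected = eval.getExpectedEntries(); // 5000: hint overrides the stats estimate
      }
    }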

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/queries/clientpositive/semijoin_hint.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/semijoin_hint.q b/ql/src/test/queries/clientpositive/semijoin_hint.q
new file mode 100644
index 0000000..5de0c8c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/semijoin_hint.q
@@ -0,0 +1,54 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.cbo.enable=true;
+set hive.compute.query.using.stats=false;
+set hive.mapred.mode=nonstrict;
+set hive.optimize.ppd=true;
+set hive.ppd.remove.duplicatefilters=true;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.optimize.metadataonly=false;
+set hive.optimize.index.filter=true;
+set hive.stats.autogather=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
+
+-- Create Tables
+create table alltypesorc_int ( cint int, cstring string ) stored as ORC;
+create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC;
+CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC;
+
+-- Add Partitions
+alter table srcpart_date add partition (ds = "2008-04-08");
+alter table srcpart_date add partition (ds = "2008-04-09");
+
+alter table srcpart_small add partition (ds = "2008-04-08");
+alter table srcpart_small add partition (ds = "2008-04-09");
+
+-- Load
+insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc;
+insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08";
+insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
+insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
+analyze table alltypesorc_int compute statistics for columns;
+analyze table srcpart_date compute statistics for columns;
+analyze table srcpart_small compute statistics for columns;
+
+set hive.cbo.returnpath.hiveop=true;
+
+create table srccc as select * from src;
+
+EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring);
+EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring);
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.returnpath.hiveop=false;
+
+explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.enable=false;
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1);
+

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
new file mode 100644
index 0000000..bac9240
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
@@ -0,0 +1,899 @@
+PREHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@alltypesorc_int
+PREHOOK: query: create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: create table srcpart_date (str string, value string) partitioned by (ds string ) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_date
+PREHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_small
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+PREHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@alltypesorc_int
+POSTHOOK: Lineage: alltypesorc_int.cint SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: alltypesorc_int.cstring SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).str SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).str SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).key1 SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).value1 SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: analyze table alltypesorc_int compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc_int
+PREHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table alltypesorc_int compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc_int
+POSTHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_date compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_date
+PREHOOK: Input: default@srcpart_date@ds=2008-04-08
+PREHOOK: Input: default@srcpart_date@ds=2008-04-09
+PREHOOK: Output: default@srcpart_date
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_date compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_date
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_small compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_small
+PREHOOK: Input: default@srcpart_small@ds=2008-04-08
+PREHOOK: Input: default@srcpart_small@ds=2008-04-09
+PREHOOK: Output: default@srcpart_small
+PREHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_small compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_small
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: create table srccc as select * from src
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srccc
+POSTHOOK: query: create table srccc as select * from src
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srccc
+POSTHOOK: Lineage: srccc.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srccc.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Reducer 7 (BROADCAST_EDGE)
+        Map 8 <- Reducer 5 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
+        Reducer 3 <- Map 8 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+        Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: i
+                  filterExpr: (cstring is not null and (cstring BETWEEN DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (cstring is not null and (cstring BETWEEN DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: cstring (type: string)
+                      outputColumnNames: cstring
+                      Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: cstring (type: string)
+                        sort order: +
+                        Map-reduce partition columns: cstring (type: string)
+                        Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 6 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: (str is not null and value is not null) (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (str is not null and value is not null) (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string), value (type: string)
+                      outputColumnNames: str, value
+                      Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: value (type: string)
+                        sort order: +
+                        Map-reduce partition columns: value (type: string)
+                        Statistics: Num rows: 2000 Data size: 356000 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: str (type: string)
+                      Select Operator
+                        expressions: value (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 182000 Basic stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=5000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 8 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 cstring (type: string)
+                  1 value (type: string)
+                outputColumnNames: str
+                Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: str (type: string)
+                  sort order: +
+                  Map-reduce partition columns: str (type: string)
+                  Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: str (type: string)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 3281 Data size: 285447 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=410)
+                    mode: hash
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                      value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 16004 Data size: 128032 Basic stats: COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 16004 Data size: 64016 Basic stats: COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=410)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+        Reducer 7 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=5000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 6 <- Reducer 4 (BROADCAST_EDGE)
+        Map 7 <- Reducer 5 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: i
+                  filterExpr: cstring is not null (type: boolean)
+                  Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: cstring is not null (type: boolean)
+                    Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: cstring (type: string)
+                      outputColumnNames: cstring
+                      Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: cstring (type: string)
+                        sort order: +
+                        Map-reduce partition columns: cstring (type: string)
+                        Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: cstring (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+                      Select Operator
+                        expressions: cstring (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 9174 Data size: 643900 Basic stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=3000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 6 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 7 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: (str is not null and (str BETWEEN DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (str is not null and (str BETWEEN DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: str
+                      Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: str (type: string)
+                        sort order: +
+                        Map-reduce partition columns: str (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                     Inner Join 1 to 2
+                keys:
+                  0 cstring (type: string)
+                  1 key1 (type: string)
+                  2 str (type: string)
+                Statistics: Num rows: 16008 Data size: 128064 Basic stats: COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 16008 Data size: 64032 Basic stats: COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=3000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: str
+                      Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: str (type: string)
+                        sort order: +
+                        Map-reduce partition columns: str (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: str (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 9756 Data size: 39024 Basic stats: COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: _col0 (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                    value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: str (type: string)
+                      sort order: +
+                      Map-reduce partition columns: str (type: string)
+                      Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 2000 Data size: 174000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000)
+                        mode: hash
+                        outputColumnNames: _col0, _col1, _col2
+                        Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                          value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+                    Reduce Output Operator
+                      key expressions: key1 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: key1 (type: string)
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: COMPLETE Column stats: PARTIAL
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                    value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+


[5/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Posted by gu...@apache.org.
HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/9d5d737d
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/9d5d737d
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/9d5d737d

Branch: refs/heads/master
Commit: 9d5d737db4f715a880f0d544d548a5ce370f602b
Parents: fa24d4b
Author: Gunther Hagleitner <gu...@apache.org>
Authored: Thu Apr 20 10:07:52 2017 -0700
Committer: Gunther Hagleitner <gu...@apache.org>
Committed: Thu Apr 20 10:07:52 2017 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java   |     2 +
 .../test/resources/testconfiguration.properties |     1 +
 .../hive/ql/optimizer/ConvertJoinMapJoin.java   |     4 +-
 .../DynamicPartitionPruningOptimization.java    |   102 +-
 .../calcite/translator/HiveOpConverter.java     |    24 +-
 .../hadoop/hive/ql/parse/CalcitePlanner.java    |    35 +
 .../hive/ql/parse/CalcitePlanner.java.orig      |  4188 +++++
 .../hadoop/hive/ql/parse/GenTezUtils.java       |    25 +-
 .../apache/hadoop/hive/ql/parse/HintParser.g    |     3 +
 .../hadoop/hive/ql/parse/ParseContext.java      |    25 +-
 .../apache/hadoop/hive/ql/parse/QBJoinTree.java |    16 +
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java  |    63 +
 .../hive/ql/parse/SemanticAnalyzer.java.orig    | 13508 +++++++++++++++++
 .../hive/ql/parse/SemiJoinBranchInfo.java       |    45 +
 .../hadoop/hive/ql/parse/SemiJoinHint.java      |    43 +
 .../hadoop/hive/ql/parse/TaskCompiler.java      |     2 +-
 .../hadoop/hive/ql/parse/TezCompiler.java       |   137 +-
 .../hive/ql/plan/ExprNodeDynamicListDesc.java   |    15 +-
 .../apache/hadoop/hive/ql/plan/JoinDesc.java    |    18 +
 .../hive/ql/ppd/SyntheticJoinPredicate.java     |     6 +-
 .../ql/udf/generic/GenericUDAFBloomFilter.java  |    13 +
 .../test/queries/clientpositive/semijoin_hint.q |    54 +
 .../clientpositive/llap/semijoin_hint.q.out     |   899 ++
 23 files changed, 19107 insertions(+), 121 deletions(-)
----------------------------------------------------------------------
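
For orientation before the per-file diffs: the new /*+ semi(...) */ hint names a table alias, optionally a column, and an expected bloom filter entry count (the q.out output above exercises both /*+ semi(k, str, 1000) */ and /*+ semi(k, 1000) */). The parsed hints are keyed by table alias and consulted when the semijoin reduction branch is generated. A minimal, self-contained Java sketch of that shape follows; it is illustrative only and not part of the commit. The real class is SemiJoinHint, of which only getColName() and getNumEntries() are visible in this patch, so the class name, fields and constructor below are assumptions.

    // Illustrative stand-in for the data a semi() hint carries; not Hive code.
    // Only the presence of a column name and an entry count is taken from the
    // patch (SemiJoinHint#getColName()/getNumEntries()); the rest is assumed.
    import java.util.HashMap;
    import java.util.Map;

    public class SemiJoinHintSketch {
      final String colName;   // null for the two-argument form, e.g. /*+ semi(k, 1000) */
      final int numEntries;   // expected bloom filter entries, e.g. 1000 in the plans above

      SemiJoinHintSketch(String colName, int numEntries) {
        this.colName = colName;
        this.numEntries = numEntries;
      }

      public static void main(String[] args) {
        // Hints are keyed by the table alias they apply to, mirroring the
        // Map<String, SemiJoinHint> passed around in the diffs below.
        Map<String, SemiJoinHintSketch> hints = new HashMap<>();
        hints.put("k", new SemiJoinHintSketch("str", 1000));  // /*+ semi(k, str, 1000) */
        SemiJoinHintSketch h = hints.get("k");
        System.out.println("column=" + h.colName + ", expectedEntries=" + h.numEntries);
      }
    }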


http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 420d35e..b10b08e 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2892,6 +2892,8 @@ public class HiveConf extends Configuration {
             "Big table for runtime filteting should be of atleast this size"),
     TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD("hive.tez.dynamic.semijoin.reduction.threshold", (float) 0.50,
             "Only perform semijoin optimization if the estimated benefit at or above this fraction of the target table"),
+    TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY("hive.tez.dynamic.semijoin.reduction.hint.only", false,
+            "When true, only enforce semijoin when a hint is provided"),
     TEZ_SMB_NUMBER_WAVES(
         "hive.tez.smb.number.waves",
         (float) 0.5,
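
The flag is consulted in DynamicPartitionPruningOptimization further down in this patch. A small illustrative sketch of that gating logic (the wrapper method and class here are not committed code; only getBoolVar() and the new ConfVars constant come from the patch):

    // Gate sketch: with hive.tez.dynamic.semijoin.reduction.hint.only=true,
    // a semijoin reduction branch is only created when a hint was supplied.
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
    import org.apache.hadoop.hive.ql.parse.SemiJoinHint;

    public class SemiJoinHintGateSketch {
      static boolean mayCreateSemiJoinBranch(HiveConf conf, SemiJoinHint sjHint) {
        boolean hintOnly = conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY);
        return !hintOnly || sjHint != null;
      }
    }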

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index ed5ce9d..116d0eb 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -572,6 +572,7 @@ minillaplocal.query.files=acid_globallimit.q,\
   schema_evol_text_vecrow_table.q,\
   selectDistinctStar.q,\
   semijoin.q,\
+  semijoin_hint.q,\
   smb_cache.q,\
   special_character_in_tabnames_1.q,\
   sqlmerge.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
index db6b05b..637bc54 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
@@ -794,7 +794,7 @@ public class ConvertJoinMapJoin implements NodeProcessor {
       // The semijoin branch can potentially create a task level cycle
       // with the hashjoin except when it is dynamically partitioned hash
       // join which takes place in a separate task.
-      if (context.parseContext.getRsOpToTsOpMap().size() > 0
+      if (context.parseContext.getRsToSemiJoinBranchInfo().size() > 0
               && removeReduceSink) {
         removeCycleCreatingSemiJoinOps(mapJoinOp, parentSelectOpOfBigTableOp,
                 context.parseContext);
@@ -826,7 +826,7 @@ public class ConvertJoinMapJoin implements NodeProcessor {
       }
 
       ReduceSinkOperator rs = (ReduceSinkOperator) op;
-      TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(rs);
+      TableScanOperator ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
       if (ts == null) {
         // skip, no semijoin branch
         continue;

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
index 838cc69..eb3eba5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java
@@ -43,12 +43,7 @@ import org.apache.hadoop.hive.ql.lib.RuleRegExp;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
-import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
-import org.apache.hadoop.hive.ql.parse.ParseContext;
-import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
-import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo;
-import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
-import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.*;
 import org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext;
 import org.apache.hadoop.hive.ql.plan.*;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator;
@@ -215,16 +210,25 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
         } else {
           LOG.debug("Column " + column + " is not a partition column");
           if (semiJoin && ts.getConf().getFilterExpr() != null) {
-            LOG.debug("Initiate semijoin reduction for " + column);
-            // Get the table name from which the min-max values will come.
+            LOG.debug("Initiate semijoin reduction for " + column + " ("
+                + ts.getConf().getFilterExpr().getExprString());
+            // Get the table name from which the min-max values and bloom filter will come.
             Operator<?> op = ctx.generator;
+
             while (!(op == null || op instanceof TableScanOperator)) {
               op = op.getParentOperators().get(0);
             }
             String tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
+
+            Map<String, SemiJoinHint> hints = ctx.desc.getHints();
+            SemiJoinHint sjHint = (hints != null) ? hints.get(tableAlias) : null;
             keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + column;
 
-            semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias);
+            semiJoinAttempted = generateSemiJoinOperatorPlan(
+                ctx, parseContext, ts, keyBaseAlias, sjHint);
+            if (!semiJoinAttempted && sjHint != null) {
+              throw new SemanticException("The user hint to enforce semijoin failed required conditions");
+            }
           }
         }
 
@@ -387,7 +391,13 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
 
   // Generates plan for min/max when dynamic partition pruning is ruled out.
   private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext,
-      TableScanOperator ts, String keyBaseAlias) throws SemanticException {
+      TableScanOperator ts, String keyBaseAlias, SemiJoinHint sjHint) throws SemanticException {
+
+    // If semijoin hint is enforced, make sure hint is provided
+    if (parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY)
+            && sjHint == null) {
+        return false;
+    }
 
     // we will put a fork in the plan at the source of the reduce sink
     Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
@@ -441,6 +451,14 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
       }
     }
 
+    // If hint is provided and only hinted semijoin optimizations should be
+    // created, then skip other columns on the table
+    if (parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_HINT_ONLY)
+            && sjHint.getColName() != null &&
+            !internalColName.equals(sjHint.getColName())) {
+      return false;
+    }
+
     List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
     keyExprs.add(key);
 
@@ -484,8 +502,6 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
             HiveConf.getFloatVar(parseContext.getConf(),
                     HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
 
-    ArrayList<ExprNodeDesc> groupByExprs = new ArrayList<ExprNodeDesc>();
-
     // Add min/max and bloom filter aggregations
     List<ObjectInspector> aggFnOIs = new ArrayList<ObjectInspector>();
     aggFnOIs.add(key.getWritableObjectInspector());
@@ -505,8 +521,14 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
       AggregationDesc bloomFilter = new AggregationDesc("bloom_filter",
               FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false),
               params, false, Mode.PARTIAL1);
-      GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
+      GenericUDAFBloomFilterEvaluator bloomFilterEval =
+          (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
       bloomFilterEval.setSourceOperator(selectOp);
+
+      if (sjHint != null && sjHint.getNumEntries() > 0) {
+        LOG.debug("Setting size for " + keyBaseAlias + " to " + sjHint.getNumEntries() + " based on the hint");
+        bloomFilterEval.setHintEntries(sjHint.getNumEntries());
+      }
       bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
       bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
       bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
@@ -607,6 +629,9 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
               bloomFilterFinalParams, false, Mode.FINAL);
       GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
       bloomFilterEval.setSourceOperator(selectOp);
+      if (sjHint != null && sjHint.getNumEntries() > 0) {
+        bloomFilterEval.setHintEntries(sjHint.getNumEntries());
+      }
       bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES));
       bloomFilterEval.setMinEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MIN_BLOOM_FILTER_ENTRIES));
       bloomFilterEval.setFactor(parseContext.getConf().getFloatVar(ConfVars.TEZ_BLOOM_FILTER_FACTOR));
@@ -635,23 +660,56 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
       rsOp.getConf().setOutputOperators(outputOperators);
     }
 
+    createFinalRsForSemiJoinOp(parseContext, ts, groupByOpFinal, key,
+            keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
+
+    return true;
+  }
+
+  private void createFinalRsForSemiJoinOp(
+          ParseContext parseContext, TableScanOperator ts, GroupByOperator gb,
+          ExprNodeDesc key, String keyBaseAlias, ExprNodeDesc colExpr,
+          boolean isHint) throws SemanticException {
+    ArrayList<String> gbOutputNames = new ArrayList<>();
+    // One each for min, max and bloom filter
+    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0));
+    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1));
+    gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2));
+
+    int colPos = 0;
+    ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
+    for (int i = 0; i < gbOutputNames.size() - 1; i++) {
+      ExprNodeColumnDesc expr = new ExprNodeColumnDesc(key.getTypeInfo(),
+              gbOutputNames.get(colPos++), "", false);
+      rsValueCols.add(expr);
+    }
+
+    // Bloom Filter uses binary
+    ExprNodeColumnDesc colBFExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo,
+            gbOutputNames.get(colPos++), "", false);
+    rsValueCols.add(colBFExpr);
+
     // Create the final Reduce Sink Operator
     ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc(
             new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false,
             -1, 0, 1, Operation.NOT_ACID);
     ReduceSinkOperator rsOpFinal = (ReduceSinkOperator)OperatorFactory.getAndMakeChild(
-            rsDescFinal, new RowSchema(groupByOpFinal.getSchema()), groupByOpFinal);
+            rsDescFinal, new RowSchema(gb.getSchema()), gb);
+    Map<String, ExprNodeDesc> columnExprMap = new HashMap<>();
     rsOpFinal.setColumnExprMap(columnExprMap);
 
-    LOG.debug("DynamicMinMaxPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
-    parseContext.getRsOpToTsOpMap().put(rsOpFinal, ts);
+    LOG.debug("DynamicSemiJoinPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts);
+    SemiJoinBranchInfo sjInfo = new SemiJoinBranchInfo(ts, isHint);
+    parseContext.getRsToSemiJoinBranchInfo().put(rsOpFinal, sjInfo);
 
     // for explain purpose
-    if (parseContext.getContext().getExplainConfig() != null
-        && parseContext.getContext().getExplainConfig().isFormatted()) {
-      List<String> outputOperators = new ArrayList<>();
+    if (parseContext.getContext().getExplainConfig() != null &&
+            parseContext.getContext().getExplainConfig().isFormatted()) {
+      List<String> outputOperators = rsOpFinal.getConf().getOutputOperators();
+      if (outputOperators == null) {
+        outputOperators = new ArrayList<>();
+      }
       outputOperators.add(ts.getOperatorId());
-      rsOpFinal.getConf().setOutputOperators(outputOperators);
     }
 
     // Save the info that is required at query time to resolve dynamic/runtime values.
@@ -666,10 +724,8 @@ public class DynamicPartitionPruningOptimization implements NodeProcessor {
     runtimeValuesInfo.setTableDesc(rsFinalTableDesc);
     runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs);
     runtimeValuesInfo.setColExprs(rsValueCols);
-    runtimeValuesInfo.setTsColExpr(ctx.parent.getChildren().get(0));
+    runtimeValuesInfo.setTsColExpr(colExpr);
     parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo);
-
-    return true;
   }
 
   private Map<Node, Object> collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx)
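
Two details worth noting in the hunk above: a hinted entry count overrides the estimate-driven bloom filter sizing via setHintEntries(), and the new SemiJoinBranchInfo records whether the branch was hint-driven, presumably so later passes can distinguish enforced branches. A tiny self-contained illustration of the first point (plain Java, no Hive types; the min/max/factor handling inside the real evaluator is not shown in this diff and is not modeled here):

    // Illustrative only: a hint-supplied entry count wins over the stats estimate,
    // mirroring the "sjHint.getNumEntries() > 0" guard above. This is not the
    // actual GenericUDAFBloomFilterEvaluator logic.
    public class BloomFilterSizingSketch {
      static long expectedEntries(Long hintEntries, long estimateFromStats) {
        return (hintEntries != null && hintEntries > 0) ? hintEntries : estimateFromStats;
      }

      public static void main(String[] args) {
        System.out.println(expectedEntries(1000L, 250000L));  // hinted: 1000
        System.out.println(expectedEntries(null, 250000L));   // unhinted: 250000
      }
    }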

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
index 73a9b0f..d375d1b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
@@ -19,6 +19,8 @@
 package org.apache.hadoop.hive.ql.optimizer.calcite.translator;
 
 
+import org.apache.hadoop.hive.ql.parse.*;
+
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -72,19 +74,8 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortExchange
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
-import org.apache.hadoop.hive.ql.parse.JoinCond;
-import org.apache.hadoop.hive.ql.parse.JoinType;
-import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec;
 import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
 import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
-import org.apache.hadoop.hive.ql.parse.PTFTranslator;
-import org.apache.hadoop.hive.ql.parse.ParseUtils;
-import org.apache.hadoop.hive.ql.parse.RowResolver;
-import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
-import org.apache.hadoop.hive.ql.parse.SemanticException;
-import org.apache.hadoop.hive.ql.parse.UnparseTranslator;
-import org.apache.hadoop.hive.ql.parse.WindowingComponentizer;
-import org.apache.hadoop.hive.ql.parse.WindowingSpec;
 import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -348,6 +339,9 @@ public class HiveOpConverter {
     // through Hive
     String[] baseSrc = new String[joinRel.getInputs().size()];
     String tabAlias = getHiveDerivedTableAlias();
+    Map<String, SemiJoinHint> semiJoinHints = semanticAnalyzer.parseSemiJoinHint(
+        semanticAnalyzer.getQB().getParseInfo().getHints());
+
     // 1. Convert inputs
     OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()];
     List<Operator<?>> children = new ArrayList<Operator<?>>(joinRel.getInputs().size());
@@ -413,7 +407,7 @@ public class HiveOpConverter {
 
     // 6. Generate Join operator
     JoinOperator joinOp = genJoin(joinRel, joinExpressions, filterExpressions, children,
-            baseSrc, tabAlias);
+            baseSrc, tabAlias, semiJoinHints);
 
     // 7. Return result
     return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
@@ -726,7 +720,7 @@ public class HiveOpConverter {
       List<String> keepColNames) throws SemanticException {
     // 1. Generate RS operator
     // 1.1 Prune the tableNames, only count the tableNames that are not empty strings
-	// as empty string in table aliases is only allowed for virtual columns.
+  // as empty string in table aliases is only allowed for virtual columns.
     String tableAlias = null;
     Set<String> tableNames = input.getSchema().getTableNames();
     for (String tableName : tableNames) {
@@ -885,7 +879,8 @@ public class HiveOpConverter {
 
   private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions,
       List<List<ExprNodeDesc>> filterExpressions, List<Operator<?>> children,
-      String[] baseSrc, String tabAlias) throws SemanticException {
+      String[] baseSrc, String tabAlias, Map<String, SemiJoinHint> semiJoinHints)
+          throws SemanticException {
 
     // 1. Extract join type
     JoinCondDesc[] joinCondns;
@@ -1011,6 +1006,7 @@ public class HiveOpConverter {
     // 4. We create the join operator with its descriptor
     JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns,
             filters, joinExpressions);
+    desc.setSemiJoinHints(semiJoinHints);
     desc.setReversedExprs(reversedExprs);
     desc.setFilterMap(filterMap);
 

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index c97b3e7..d10b6bf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hive.ql.parse;
 
+import org.antlr.runtime.tree.Tree;
+
 import java.io.IOException;
 import java.lang.reflect.Field;
 import java.lang.reflect.InvocationTargetException;
@@ -332,6 +334,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
       skipCalcitePlan = true;
     } else {
       PreCboCtx cboCtx = (PreCboCtx) plannerCtx;
+      ASTNode oldHints = getQB().getParseInfo().getHints();
 
       // Note: for now, we don't actually pass the queryForCbo to CBO, because
       // it accepts qb, not AST, and can also access all the private stuff in
@@ -399,6 +402,15 @@ public class CalcitePlanner extends SemanticAnalyzer {
                 newAST = reAnalyzeCTASAfterCbo(newAST);
               }
             }
+            if (oldHints != null) {
+              if (getQB().getParseInfo().getHints() != null) {
+                LOG.warn("Hints are not null in the optimized tree; before CBO " + oldHints.dump()
+                    + "; after CBO " + getQB().getParseInfo().getHints().dump());
+              } else {
+                LOG.debug("Propagating hints to QB: " + oldHints);
+                getQB().getParseInfo().setHints(oldHints);
+              }
+            }
             Phase1Ctx ctx_1 = initPhase1Ctx();
             if (!doPhase1(newAST, getQB(), ctx_1, null)) {
               throw new RuntimeException("Couldn't do phase1 on CBO optimized query plan");
@@ -3462,6 +3474,24 @@ public class CalcitePlanner extends SemanticAnalyzer {
       return selRel;
     }
 
+    private void setQueryHints(QB qb) throws SemanticException {
+      QBParseInfo qbp = getQBParseInfo(qb);
+      String selClauseName = qbp.getClauseNames().iterator().next();
+      Tree selExpr0 = qbp.getSelForClause(selClauseName).getChild(0);
+
+      if (selExpr0.getType() != HiveParser.QUERY_HINT) return;
+      String hint = ctx.getTokenRewriteStream().toString(
+          selExpr0.getTokenStartIndex(), selExpr0.getTokenStopIndex());
+      LOG.debug("Handling query hints: " + hint);
+      ParseDriver pd = new ParseDriver();
+      try {
+        ASTNode hintNode = pd.parseHint(hint);
+        qbp.setHints((ASTNode) hintNode);
+      } catch (ParseException e) {
+        throw new SemanticException("failed to parse query hint: "+e.getMessage(), e);
+      }
+    }
+
     /**
      * NOTE: there can only be one select caluse since we don't handle multi
      * destination insert.
@@ -3960,7 +3990,12 @@ public class CalcitePlanner extends SemanticAnalyzer {
         throw new CalciteSemanticException("Unsupported", UnsupportedFeature.Others);
 
       }
+
       // 1.3 process join
+      // 1.3.1 process hints
+      setQueryHints(qb);
+
+      // 1.3.2 process the actual join
       if (qb.getParseInfo().getJoinExpr() != null) {
         srcRel = genJoinLogicalPlan(qb.getParseInfo().getJoinExpr(), aliasToRel);
       } else {
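
The setQueryHints() addition above re-extracts the hint text from the token stream and re-parses it, and the oldHints block re-attaches hints to the QB after the CBO rewrite, so query hints survive the Calcite round trip. A minimal sketch of just the re-parse step, using only the calls visible above (ParseDriver.parseHint and, later, QBParseInfo.setHints); the exact hint string handed in is whatever the token rewrite stream yields for the QUERY_HINT node:

    // Illustrative helper around the re-parse step shown above; not committed code.
    import org.apache.hadoop.hive.ql.parse.ASTNode;
    import org.apache.hadoop.hive.ql.parse.ParseDriver;
    import org.apache.hadoop.hive.ql.parse.ParseException;

    public class HintReparseSketch {
      static ASTNode reparseHint(String hintText) throws ParseException {
        ParseDriver pd = new ParseDriver();
        // The resulting ASTNode is what CalcitePlanner attaches via QBParseInfo.setHints().
        return pd.parseHint(hintText);
      }
    }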


[4/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
new file mode 100644
index 0000000..c97b3e7
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java.orig
@@ -0,0 +1,4188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.parse;
+
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.UndeclaredThrowableException;
+import java.math.BigDecimal;
+import java.util.AbstractMap.SimpleEntry;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.antlr.runtime.ClassicToken;
+import org.antlr.runtime.CommonToken;
+import org.antlr.runtime.tree.TreeVisitor;
+import org.antlr.runtime.tree.TreeVisitorAction;
+import org.apache.calcite.adapter.druid.DruidQuery;
+import org.apache.calcite.adapter.druid.DruidRules;
+import org.apache.calcite.adapter.druid.DruidSchema;
+import org.apache.calcite.adapter.druid.DruidTable;
+import org.apache.calcite.adapter.druid.LocalInterval;
+import org.apache.calcite.config.CalciteConnectionConfigImpl;
+import org.apache.calcite.config.CalciteConnectionProperty;
+import org.apache.calcite.plan.RelOptCluster;
+import org.apache.calcite.plan.RelOptMaterialization;
+import org.apache.calcite.plan.RelOptPlanner;
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptSchema;
+import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.plan.RelTraitSet;
+import org.apache.calcite.plan.hep.HepMatchOrder;
+import org.apache.calcite.plan.hep.HepPlanner;
+import org.apache.calcite.plan.hep.HepProgram;
+import org.apache.calcite.plan.hep.HepProgramBuilder;
+import org.apache.calcite.rel.RelCollation;
+import org.apache.calcite.rel.RelCollationImpl;
+import org.apache.calcite.rel.RelCollations;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.rel.core.Filter;
+import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.rel.core.SetOp;
+import org.apache.calcite.rel.core.TableScan;
+import org.apache.calcite.rel.metadata.CachingRelMetadataProvider;
+import org.apache.calcite.rel.metadata.ChainedRelMetadataProvider;
+import org.apache.calcite.rel.metadata.DefaultRelMetadataProvider;
+import org.apache.calcite.rel.metadata.JaninoRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rel.rules.FilterMergeRule;
+import org.apache.calcite.rel.rules.JoinToMultiJoinRule;
+import org.apache.calcite.rel.rules.LoptOptimizeJoinRule;
+import org.apache.calcite.rel.rules.ProjectMergeRule;
+import org.apache.calcite.rel.rules.ProjectRemoveRule;
+import org.apache.calcite.rel.rules.SemiJoinFilterTransposeRule;
+import org.apache.calcite.rel.rules.SemiJoinJoinTransposeRule;
+import org.apache.calcite.rel.rules.SemiJoinProjectTransposeRule;
+import org.apache.calcite.rel.rules.UnionMergeRule;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rel.type.RelDataTypeFactory;
+import org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.calcite.rel.type.RelDataTypeImpl;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexExecutor;
+import org.apache.calcite.rex.RexFieldCollation;
+import org.apache.calcite.rex.RexInputRef;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexUtil;
+import org.apache.calcite.rex.RexWindowBound;
+import org.apache.calcite.schema.SchemaPlus;
+import org.apache.calcite.sql.SqlAggFunction;
+import org.apache.calcite.sql.SqlCall;
+import org.apache.calcite.sql.SqlExplainLevel;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.SqlLiteral;
+import org.apache.calcite.sql.SqlNode;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.calcite.sql.SqlWindow;
+import org.apache.calcite.sql.parser.SqlParserPos;
+import org.apache.calcite.sql.type.SqlTypeName;
+import org.apache.calcite.tools.Frameworks;
+import org.apache.calcite.util.CompositeList;
+import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.ImmutableIntList;
+import org.apache.calcite.util.Pair;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.conf.Constants;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.QueryProperties;
+import org.apache.hadoop.hive.ql.QueryState;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.log.PerfLogger;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSubquerySemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteViewSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HivePlannerContext;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil;
+import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveAlgorithmsConf;
+import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveVolcanoPlanner;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExcept;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIntersect;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSortLimit;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableFunctionScan;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveUnion;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateJoinTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregateProjectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveAggregatePullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExceptRewriteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveExpandDistinctAggregatesRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterAggregateTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTSTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSetOpTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveFilterSortTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveInsertExchange4JoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveIntersectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveIntersectRewriteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinAddNotNullRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinCommuteRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinPushTransitivePredicatesRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveJoinToMultiJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePartitionPruneRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePreFilteringRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectFilterPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectOverIntersectRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveProjectSortTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveReduceExpressionsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveReduceExpressionsWithStatsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelDecorrelator;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortLimitPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortMergeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortProjectTransposeRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortUnionReduceRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSubQueryRemoveRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionPullUpConstantsRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializedViewFilterScanRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTBuilder;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinCondTypeCheckProcFactory;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.JoinTypeCheckCtx;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.PlanModifierForReturnPath;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.RexNodeConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec;
+import org.apache.hadoop.hive.ql.parse.QBExpr.Opcode;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.HiveOperation;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableList.Builder;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import org.apache.calcite.config.CalciteConnectionConfig;
+
+public class CalcitePlanner extends SemanticAnalyzer {
+
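+  // Planner state for the current query: runCBO controls whether the Calcite-based
+  // optimizer is attempted, disableSemJoinReordering suppresses join merging while CBO
+  // owns join reordering, profilesCBO records which extended CBO profiles apply, and
+  // semanticException captures a failure raised inside the Calcite planner action.
+  // noColsMissingStats is consulted to tell missing-statistics failures from other CBO errors.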
+  private final AtomicInteger noColsMissingStats = new AtomicInteger(0);
+  private SemanticException semanticException;
+  private boolean runCBO = true;
+  private boolean disableSemJoinReordering = true;
+  private EnumSet<ExtendedCBOProfile> profilesCBO;
+
+  public CalcitePlanner(QueryState queryState) throws SemanticException {
+    super(queryState);
+    if (!HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) {
+      runCBO = false;
+      disableSemJoinReordering = false;
+    }
+  }
+
+  public void resetCalciteConfiguration() {
+    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_ENABLED)) {
+      runCBO = true;
+      disableSemJoinReordering = true;
+    }
+  }
+
+  @Override
+  @SuppressWarnings("nls")
+  public void analyzeInternal(ASTNode ast) throws SemanticException {
+    if (runCBO) {
+      PreCboCtx cboCtx = new PreCboCtx();
+      super.analyzeInternal(ast, cboCtx);
+    } else {
+      super.analyzeInternal(ast);
+    }
+  }
+
+  /**
+   * Obtains the logical plan for the query after it has been parsed and
+   * optimized by Calcite.
+   *
+   * @return the Calcite plan for the query, null if it could not be generated
+   */
+  public RelNode genLogicalPlan(ASTNode ast) throws SemanticException {
+    LOG.info("Starting generating logical plan");
+    PreCboCtx cboCtx = new PreCboCtx();
+    //change the location of position alias process here
+    processPositionAlias(ast);
+    if (!genResolvedParseTree(ast, cboCtx)) {
+      return null;
+    }
+    ASTNode queryForCbo = ast;
+    if (cboCtx.type == PreCboCtx.Type.CTAS || cboCtx.type == PreCboCtx.Type.VIEW) {
+      queryForCbo = cboCtx.nodeOfInterest; // nodeOfInterest is the query
+    }
+    runCBO = canCBOHandleAst(queryForCbo, getQB(), cboCtx);
+    if (!runCBO) {
+      return null;
+    }
+    profilesCBO = obtainCBOProfiles(queryProperties);
+    disableJoinMerge = true;
+    final RelNode resPlan = logicalPlan();
+    LOG.info("Finished generating logical plan");
+    return resPlan;
+  }
+
+  @Override
+  @SuppressWarnings("rawtypes")
+  Operator genOPTree(ASTNode ast, PlannerContext plannerCtx) throws SemanticException {
+    Operator sinkOp = null;
+    boolean skipCalcitePlan = false;
+
+    if (!runCBO) {
+      skipCalcitePlan = true;
+    } else {
+      PreCboCtx cboCtx = (PreCboCtx) plannerCtx;
+
+      // Note: for now, we don't actually pass the queryForCbo to CBO, because
+      // it accepts qb, not AST, and can also access all the private stuff in
+      // SA. We rely on the fact that CBO ignores the unknown tokens (create
+      // table, destination), so if the query is otherwise ok, it is as if we
+      // did remove those and gave CBO the proper AST. That is kinda hacky.
+      ASTNode queryForCbo = ast;
+      if (cboCtx.type == PreCboCtx.Type.CTAS || cboCtx.type == PreCboCtx.Type.VIEW) {
+        queryForCbo = cboCtx.nodeOfInterest; // nodeOfInterest is the query
+      }
+      runCBO = canCBOHandleAst(queryForCbo, getQB(), cboCtx);
+      if (queryProperties.hasMultiDestQuery()) {
+        handleMultiDestQuery(ast, cboCtx);
+      }
+
+      if (runCBO) {
+        profilesCBO = obtainCBOProfiles(queryProperties);
+
+        disableJoinMerge = true;
+        boolean reAnalyzeAST = false;
+        final boolean materializedView = getQB().isMaterializedView();
+
+        try {
+          if (this.conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) {
+            if (cboCtx.type == PreCboCtx.Type.VIEW && !materializedView) {
+              throw new SemanticException("Create view is not supported in cbo return path.");
+            }
+            sinkOp = getOptimizedHiveOPDag();
+            LOG.info("CBO Succeeded; optimized logical plan.");
+            this.ctx.setCboInfo("Plan optimized by CBO.");
+            this.ctx.setCboSucceeded(true);
+          } else {
+            // 1. Gen Optimized AST
+            ASTNode newAST = getOptimizedAST();
+
+            // 1.1. Fix up the query for insert/ctas/materialized views
+            newAST = fixUpAfterCbo(ast, newAST, cboCtx);
+
+            // 2. Regen OP plan from optimized AST
+            if (cboCtx.type == PreCboCtx.Type.VIEW && !materializedView) {
+              try {
+                handleCreateViewDDL(newAST);
+              } catch (SemanticException e) {
+                throw new CalciteViewSemanticException(e.getMessage());
+              }
+            } else {
+              init(false);
+              if (cboCtx.type == PreCboCtx.Type.VIEW && materializedView) {
+                // Redo create-table/view analysis, because it's not part of
+                // doPhase1.
+                // Use the REWRITTEN AST
+                setAST(newAST);
+                newAST = reAnalyzeViewAfterCbo(newAST);
+                // Store text of the ORIGINAL QUERY
+                String originalText = ctx.getTokenRewriteStream().toString(
+                    cboCtx.nodeOfInterest.getTokenStartIndex(),
+                    cboCtx.nodeOfInterest.getTokenStopIndex());
+                createVwDesc.setViewOriginalText(originalText);
+                viewSelect = newAST;
+                viewsExpanded = new ArrayList<>();
+                viewsExpanded.add(createVwDesc.getViewName());
+              } else if (cboCtx.type == PreCboCtx.Type.CTAS) {
+                // CTAS
+                setAST(newAST);
+                newAST = reAnalyzeCTASAfterCbo(newAST);
+              }
+            }
+            Phase1Ctx ctx_1 = initPhase1Ctx();
+            if (!doPhase1(newAST, getQB(), ctx_1, null)) {
+              throw new RuntimeException("Couldn't do phase1 on CBO optimized query plan");
+            }
+            // Unfortunately, making prunedPartitions immutable is not possible
+            // here: with SemiJoins, not all tables are costed in CBO, so their
+            // PartitionList is not evaluated until the run phase.
+            getMetaData(getQB());
+
+            disableJoinMerge = defaultJoinMerge;
+            sinkOp = genPlan(getQB());
+            LOG.info("CBO Succeeded; optimized logical plan.");
+            this.ctx.setCboInfo("Plan optimized by CBO.");
+            this.ctx.setCboSucceeded(true);
+            if (LOG.isTraceEnabled()) {
+              LOG.trace(newAST.dump());
+            }
+          }
+        } catch (Exception e) {
+          boolean isMissingStats = noColsMissingStats.get() > 0;
+          if (isMissingStats) {
+            LOG.error("CBO failed due to missing column stats (see previous errors), skipping CBO");
+            this.ctx
+                .setCboInfo("Plan not optimized by CBO due to missing statistics. Please check log for more details.");
+          } else {
+            LOG.error("CBO failed, skipping CBO. ", e);
+            if (e instanceof CalciteSemanticException) {
+              CalciteSemanticException calciteSemanticException = (CalciteSemanticException) e;
+              UnsupportedFeature unsupportedFeature = calciteSemanticException
+                  .getUnsupportedFeature();
+              if (unsupportedFeature != null) {
+                this.ctx.setCboInfo("Plan not optimized by CBO due to missing feature ["
+                    + unsupportedFeature + "].");
+              } else {
+                this.ctx.setCboInfo("Plan not optimized by CBO.");
+              }
+            } else {
+              this.ctx.setCboInfo("Plan not optimized by CBO.");
+            }
+          }
+          if (e instanceof CalciteSubquerySemanticException) {
+            // The non-CBO path retries executing subqueries and throws a completely
+            // different exception/error that eclipses the original error message,
+            // so avoid re-executing subqueries on the non-CBO path.
+            throw new SemanticException(e);
+          }
+          else if (e instanceof CalciteViewSemanticException) {
+            // The non-CBO path retries executing the create view and
+            // is expected to throw the same error message.
+            throw new SemanticException(e);
+          }
+          else if (!conf.getBoolVar(ConfVars.HIVE_IN_TEST) || isMissingStats
+              || e instanceof CalciteSemanticException) {
+            reAnalyzeAST = true;
+          } else if (e instanceof SemanticException) {
+            // Although it is likely to be a valid exception, we retry
+            // with CBO off anyway.
+            reAnalyzeAST = true;
+          } else if (e instanceof RuntimeException) {
+            throw (RuntimeException) e;
+          } else {
+            throw new SemanticException(e);
+          }
+        } finally {
+          runCBO = false;
+          disableJoinMerge = defaultJoinMerge;
+          disableSemJoinReordering = false;
+          if (reAnalyzeAST) {
+            init(true);
+            prunedPartitions.clear();
+            // Assumption: At this point Parse Tree gen & resolution will always
+            // be true (since we started out that way).
+            super.genResolvedParseTree(ast, new PlannerContext());
+            skipCalcitePlan = true;
+          }
+        }
+      } else {
+        this.ctx.setCboInfo("Plan not optimized by CBO.");
+        skipCalcitePlan = true;
+      }
+    }
+
+    if (skipCalcitePlan) {
+      sinkOp = super.genOPTree(ast, plannerCtx);
+    }
+
+    return sinkOp;
+  }
+
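+  // Re-analyzes the CBO-rewritten AST for a CREATE VIEW and then restores the view's
+  // original/expanded text, schema and partition columns saved beforehand, so that the
+  // stored view definition still reflects the user's original statement.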
+  private void handleCreateViewDDL(ASTNode newAST) throws SemanticException {
+    saveViewDefinition();
+    String originalText = createVwDesc.getViewOriginalText();
+    String expandedText = createVwDesc.getViewExpandedText();
+    List<FieldSchema> schema = createVwDesc.getSchema();
+    List<FieldSchema> partitionColumns = createVwDesc.getPartCols();
+    init(false);
+    setAST(newAST);
+    newAST = reAnalyzeViewAfterCbo(newAST);
+    createVwDesc.setViewOriginalText(originalText);
+    createVwDesc.setViewExpandedText(expandedText);
+    createVwDesc.setSchema(schema);
+    createVwDesc.setPartCols(partitionColumns);
+  }
+
+  /*
+   * Tries to optimize the FROM clause of a multi-insert query; no attempt is made to
+   * optimize the insert clauses. If the FROM clause cannot be rewritten, CBO is
+   * disabled for the query.
+   */
+  private void handleMultiDestQuery(ASTNode ast, PreCboCtx cboCtx) throws SemanticException {
+    // Not supported by CBO
+    if (!runCBO) {
+      return;
+    }
+    // Currently, we only optimize the content of the FROM clause
+    // for multi-insert queries. Thus, nodeOfInterest is the FROM clause.
+    if (isJoinToken(cboCtx.nodeOfInterest)) {
+      // Join clause: rewriting is needed
+      ASTNode subq = rewriteASTForMultiInsert(ast, cboCtx.nodeOfInterest);
+      if (subq != null) {
+        // We could rewrite into a subquery
+        cboCtx.nodeOfInterest = (ASTNode) subq.getChild(0);
+        QB newQB = new QB(null, "", false);
+        Phase1Ctx ctx_1 = initPhase1Ctx();
+        doPhase1(cboCtx.nodeOfInterest, newQB, ctx_1, null);
+        setQB(newQB);
+        getMetaData(getQB());
+      } else {
+        runCBO = false;
+      }
+    } else if (cboCtx.nodeOfInterest.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+      // Subquery: no rewriting needed
+      ASTNode subq = cboCtx.nodeOfInterest;
+      // First child is subquery, second child is alias
+      // We set the node of interest and QB to the subquery
+      // We do not need to generate the QB again, but rather we use it directly
+      cboCtx.nodeOfInterest = (ASTNode) subq.getChild(0);
+      String subQAlias = unescapeIdentifier(subq.getChild(1).getText());
+      final QB newQB = getQB().getSubqForAlias(subQAlias).getQB();
+      newQB.getParseInfo().setAlias("");
+      newQB.getParseInfo().setIsSubQ(false);
+      setQB(newQB);
+    } else {
+      // No need to run CBO (table ref or virtual table) or not supported
+      runCBO = false;
+    }
+  }
+
+  private ASTNode rewriteASTForMultiInsert(ASTNode query, ASTNode nodeOfInterest) {
+    // 1. gather references from original query
+    // This is a map from aliases to references.
+    // We keep all references as we will need to modify them after creating
+    // the subquery
+    final Multimap<String, Object> aliasNodes = ArrayListMultimap.create();
+    // To know if we need to bail out
+    final AtomicBoolean notSupported = new AtomicBoolean(false);
+    TreeVisitorAction action = new TreeVisitorAction() {
+      @Override
+      public Object pre(Object t) {
+        if (!notSupported.get()) {
+          if (ParseDriver.adaptor.getType(t) == HiveParser.TOK_ALLCOLREF) {
+            // TODO: this is a limitation of the AST rewriting approach that we will
+            // not be able to overcome till proper integration of full multi-insert
+            // queries with Calcite is implemented.
+            // The current rewriting gathers references from insert clauses and then
+            // updates them with the new subquery references. However, if insert
+            // clauses use * or tab.*, we cannot resolve the columns that we are
+            // referring to. Thus, we just bail out and those queries will not be
+            // currently optimized by Calcite.
+            // An example of such query is:
+            // FROM T_A a LEFT JOIN T_B b ON a.id = b.id
+            // INSERT OVERWRITE TABLE join_result_1
+            // SELECT a.*, b.*
+            // INSERT OVERWRITE TABLE join_result_3
+            // SELECT a.*, b.*;
+            notSupported.set(true);
+          } else if (ParseDriver.adaptor.getType(t) == HiveParser.DOT) {
+            Object c = ParseDriver.adaptor.getChild(t, 0);
+            if (c != null && ParseDriver.adaptor.getType(c) == HiveParser.TOK_TABLE_OR_COL) {
+              aliasNodes.put(((ASTNode) t).toStringTree(), t);
+            }
+          } else if (ParseDriver.adaptor.getType(t) == HiveParser.TOK_TABLE_OR_COL) {
+            Object p = ParseDriver.adaptor.getParent(t);
+            if (p == null || ParseDriver.adaptor.getType(p) != HiveParser.DOT) {
+              aliasNodes.put(((ASTNode) t).toStringTree(), t);
+            }
+          }
+        }
+        return t;
+      }
+      @Override
+      public Object post(Object t) {
+        return t;
+      }
+    };
+    TreeVisitor tv = new TreeVisitor(ParseDriver.adaptor);
+    // We will iterate through the children: if it is an INSERT, we will traverse
+    // the subtree to gather the references
+    for (int i = 0; i < query.getChildCount(); i++) {
+      ASTNode child = (ASTNode) query.getChild(i);
+      if (ParseDriver.adaptor.getType(child) != HiveParser.TOK_INSERT) {
+        // If it is not an INSERT, we do not need to do anything
+        continue;
+      }
+      tv.visit(child, action);
+    }
+    if (notSupported.get()) {
+      // Bail out
+      return null;
+    }
+    // 2. rewrite into query
+    //  TOK_QUERY
+    //     TOK_FROM
+    //        join
+    //     TOK_INSERT
+    //        TOK_DESTINATION
+    //           TOK_DIR
+    //              TOK_TMP_FILE
+    //        TOK_SELECT
+    //           refs
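+    // For illustration (hypothetical query, assuming explicit column references rather
+    // than the unsupported a.*/b.* case above): for
+    //   FROM T_A a JOIN T_B b ON a.id = b.id
+    //   INSERT OVERWRITE TABLE t1 SELECT a.id
+    //   INSERT OVERWRITE TABLE t2 SELECT b.id
+    // the FROM subtree is roughly rewritten into
+    //   (SELECT a.id AS col0, b.id AS col1 FROM T_A a JOIN T_B b ON a.id = b.id) subq
+    // and the insert clauses are rewritten to reference col0 / col1.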
+    ASTNode from = new ASTNode(new CommonToken(HiveParser.TOK_FROM, "TOK_FROM"));
+    from.addChild((ASTNode) ParseDriver.adaptor.dupTree(nodeOfInterest));
+    ASTNode destination = new ASTNode(new CommonToken(HiveParser.TOK_DESTINATION, "TOK_DESTINATION"));
+    ASTNode dir = new ASTNode(new CommonToken(HiveParser.TOK_DIR, "TOK_DIR"));
+    ASTNode tmpFile = new ASTNode(new CommonToken(HiveParser.TOK_TMP_FILE, "TOK_TMP_FILE"));
+    dir.addChild(tmpFile);
+    destination.addChild(dir);
+    ASTNode select = new ASTNode(new CommonToken(HiveParser.TOK_SELECT, "TOK_SELECT"));
+    int num = 0;
+    for (Collection<Object> selectIdentifier : aliasNodes.asMap().values()) {
+      Iterator<Object> it = selectIdentifier.iterator();
+      ASTNode node = (ASTNode) it.next();
+      // Add select expression
+      ASTNode selectExpr = new ASTNode(new CommonToken(HiveParser.TOK_SELEXPR, "TOK_SELEXPR"));
+      selectExpr.addChild((ASTNode) ParseDriver.adaptor.dupTree(node)); // Identifier
+      String colAlias = "col" + num;
+      selectExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias))); // Alias
+      select.addChild(selectExpr);
+      // Rewrite all INSERT references (all the node values for this key)
+      ASTNode colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+      colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
+      replaceASTChild(node, colExpr);
+      while (it.hasNext()) {
+        // Loop to rewrite rest of INSERT references
+        node = (ASTNode) it.next();
+        colExpr = new ASTNode(new CommonToken(HiveParser.TOK_TABLE_OR_COL, "TOK_TABLE_OR_COL"));
+        colExpr.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, colAlias)));
+        replaceASTChild(node, colExpr);
+      }
+      num++;
+    }
+    ASTNode insert = new ASTNode(new CommonToken(HiveParser.TOK_INSERT, "TOK_INSERT"));
+    insert.addChild(destination);
+    insert.addChild(select);
+    ASTNode newQuery = new ASTNode(new CommonToken(HiveParser.TOK_QUERY, "TOK_QUERY"));
+    newQuery.addChild(from);
+    newQuery.addChild(insert);
+    // 3. create subquery
+    ASTNode subq = new ASTNode(new CommonToken(HiveParser.TOK_SUBQUERY, "TOK_SUBQUERY"));
+    subq.addChild(newQuery);
+    subq.addChild(new ASTNode(new CommonToken(HiveParser.Identifier, "subq")));
+    replaceASTChild(nodeOfInterest, subq);
+    // 4. return subquery
+    return subq;
+  }
+
+  /**
+   * Can CBO handle the given AST?
+   *
+   * @param ast
+   *          Top level AST
+   * @param qb
+   *          top level QB corresponding to the AST
+   * @param cboCtx
+   * @return boolean
+   *
+   *         Assumption:<br>
+   *         If the top level QB is a query, then everything below it must also be
+   *         a query.
+   */
+  boolean canCBOHandleAst(ASTNode ast, QB qb, PreCboCtx cboCtx) {
+    int root = ast.getToken().getType();
+    boolean needToLogMessage = STATIC_LOG.isInfoEnabled();
+    boolean isSupportedRoot = root == HiveParser.TOK_QUERY || root == HiveParser.TOK_EXPLAIN
+        || qb.isCTAS() || qb.isMaterializedView();
+    // Queries without a source table currently are not supported by CBO
+    boolean isSupportedType = (qb.getIsQuery() && !qb.containsQueryWithoutSourceTable())
+        || qb.isCTAS() || qb.isMaterializedView() || cboCtx.type == PreCboCtx.Type.INSERT
+        || cboCtx.type == PreCboCtx.Type.MULTI_INSERT;
+    boolean noBadTokens = HiveCalciteUtil.validateASTForUnsupportedTokens(ast);
+    boolean result = isSupportedRoot && isSupportedType && noBadTokens;
+
+    if (!result) {
+      if (needToLogMessage) {
+        String msg = "";
+        if (!isSupportedRoot) {
+          msg += "doesn't have QUERY or EXPLAIN as root and not a CTAS; ";
+        }
+        if (!isSupportedType) {
+          msg += "is not a query with at least one source table "
+                  + " or there is a subquery without a source table, or CTAS, or insert; ";
+        }
+        if (!noBadTokens) {
+          msg += "has unsupported tokens; ";
+        }
+
+        if (msg.isEmpty()) {
+          msg += "has some unspecified limitations; ";
+        }
+        STATIC_LOG.info("Not invoking CBO because the statement "
+            + msg.substring(0, msg.length() - 2));
+      }
+      return false;
+    }
+    // Now check QB in more detail. canHandleQbForCbo returns null if query can
+    // be handled.
+    String msg = CalcitePlanner.canHandleQbForCbo(queryProperties, conf, true, needToLogMessage, qb);
+    if (msg == null) {
+      return true;
+    }
+    if (needToLogMessage) {
+      STATIC_LOG.info("Not invoking CBO because the statement "
+          + msg.substring(0, msg.length() - 2));
+    }
+    return false;
+  }
+
+  /**
+   * Checks whether Calcite can handle the query.
+   *
+   * @param queryProperties
+   * @param conf
+   * @param topLevelQB
+   *          Does the QB correspond to the top-most query block?
+   * @param verbose
+   *          Whether return value should be verbose in case of failure.
+   * @return null if the query can be handled; non-null reason string if it
+   *         cannot be.
+   *
+   *         Assumption:<br>
+   *         1. If top level QB is query then everything below it must also be
+   *         Query<br>
+   *         2. A nested subquery will return false for qb.getIsQuery()
+   */
+  static String canHandleQbForCbo(QueryProperties queryProperties, HiveConf conf,
+      boolean topLevelQB, boolean verbose, QB qb) {
+
+    if (!queryProperties.hasClusterBy() && !queryProperties.hasDistributeBy()
+        && !queryProperties.hasSortBy() && !queryProperties.hasPTF() && !queryProperties.usesScript()
+        && !queryProperties.hasLateralViews()) {
+      // Ok to run CBO.
+      return null;
+    }
+
+    // Not ok to run CBO, build error message.
+    String msg = "";
+    if (verbose) {
+      if (queryProperties.hasClusterBy())
+        msg += "has cluster by; ";
+      if (queryProperties.hasDistributeBy())
+        msg += "has distribute by; ";
+      if (queryProperties.hasSortBy())
+        msg += "has sort by; ";
+      if (queryProperties.hasPTF())
+        msg += "has PTF; ";
+      if (queryProperties.usesScript())
+        msg += "uses scripts; ";
+      if (queryProperties.hasLateralViews())
+        msg += "has lateral views; ";
+
+      if (msg.isEmpty())
+        msg += "has some unspecified limitations; ";
+    }
+    return msg;
+  }
+
+  /* This method inserts the right profiles into profilesCBO depending
+   * on the query characteristics. */
+  private static EnumSet<ExtendedCBOProfile> obtainCBOProfiles(QueryProperties queryProperties) {
+    EnumSet<ExtendedCBOProfile> profilesCBO = EnumSet.noneOf(ExtendedCBOProfile.class);
+    // If the query contains more than one join
+    if (queryProperties.getJoinCount() > 1) {
+      profilesCBO.add(ExtendedCBOProfile.JOIN_REORDERING);
+    }
+    // If the query contains windowing processing
+    if (queryProperties.hasWindowing()) {
+      profilesCBO.add(ExtendedCBOProfile.WINDOWING_POSTPROCESSING);
+    }
+    return profilesCBO;
+  }
+
+  @Override
+  boolean isCBOExecuted() {
+    return runCBO;
+  }
+
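+  // Join merging in the semantic analyzer is suppressed while CBO is running and owns
+  // semijoin/join reordering; disableSemJoinReordering is reset once CBO finishes or is
+  // skipped, after which merging resumes.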
+  @Override
+  boolean continueJoinMerge() {
+    return !(runCBO && disableSemJoinReordering);
+  }
+
+  @Override
+  Table materializeCTE(String cteName, CTEClause cte) throws HiveException {
+
+    ASTNode createTable = new ASTNode(new ClassicToken(HiveParser.TOK_CREATETABLE));
+
+    ASTNode tableName = new ASTNode(new ClassicToken(HiveParser.TOK_TABNAME));
+    tableName.addChild(new ASTNode(new ClassicToken(HiveParser.Identifier, cteName)));
+
+    ASTNode temporary = new ASTNode(new ClassicToken(HiveParser.KW_TEMPORARY, MATERIALIZATION_MARKER));
+
+    createTable.addChild(tableName);
+    createTable.addChild(temporary);
+    createTable.addChild(cte.cteNode);
+
+    CalcitePlanner analyzer = new CalcitePlanner(queryState);
+    analyzer.initCtx(ctx);
+    analyzer.init(false);
+
+    // should share cte contexts
+    analyzer.aliasToCTEs.putAll(aliasToCTEs);
+
+    HiveOperation operation = queryState.getHiveOperation();
+    try {
+      analyzer.analyzeInternal(createTable);
+    } finally {
+      queryState.setCommandType(operation);
+    }
+
+    Table table = analyzer.tableDesc.toTable(conf);
+    Path location = table.getDataLocation();
+    try {
+      location.getFileSystem(conf).mkdirs(location);
+    } catch (IOException e) {
+      throw new HiveException(e);
+    }
+    table.setMaterializedTable(true);
+
+    LOG.info(cteName + " will be materialized into " + location);
+    cte.table = table;
+    cte.source = analyzer;
+
+    ctx.addMaterializedTable(cteName, table);
+    // For CalcitePlanner, store qualified name too
+    ctx.addMaterializedTable(table.getDbName() + "." + table.getTableName(), table);
+
+    return table;
+  }
+
+  @Override
+  String fixCtasColumnName(String colName) {
+    if (runCBO) {
+      int lastDot = colName.lastIndexOf('.');
+      if (lastDot < 0)
+        return colName; // alias is not fully qualified
+      String nqColumnName = colName.substring(lastDot + 1);
+      STATIC_LOG.debug("Replacing " + colName + " (produced by CBO) by " + nqColumnName);
+      return nqColumnName;
+    }
+
+    return super.fixCtasColumnName(colName);
+  }
+
+  /**
+   * The context that doPhase1 uses to populate information pertaining to CBO
+   * (currently, this is used for CTAS and insert-as-select).
+   */
+  static class PreCboCtx extends PlannerContext {
+    enum Type {
+      NONE, INSERT, MULTI_INSERT, CTAS, VIEW, UNEXPECTED
+    }
+
+    private ASTNode nodeOfInterest;
+    private Type    type = Type.NONE;
+
+    private void set(Type type, ASTNode ast) {
+      if (this.type != Type.NONE) {
+        STATIC_LOG.warn("Setting " + type + " when already " + this.type + "; node " + ast.dump()
+            + " vs old node " + nodeOfInterest.dump());
+        this.type = Type.UNEXPECTED;
+        return;
+      }
+      this.type = type;
+      this.nodeOfInterest = ast;
+    }
+
+    @Override
+    void setCTASToken(ASTNode child) {
+      set(PreCboCtx.Type.CTAS, child);
+    }
+
+    @Override
+    void setViewToken(ASTNode child) {
+      set(PreCboCtx.Type.VIEW, child);
+    }
+
+    @Override
+    void setInsertToken(ASTNode ast, boolean isTmpFileDest) {
+      if (!isTmpFileDest) {
+        set(PreCboCtx.Type.INSERT, ast);
+      }
+    }
+
+    @Override
+    void setMultiInsertToken(ASTNode child) {
+      set(PreCboCtx.Type.MULTI_INSERT, child);
+    }
+
+    @Override
+    void resetToken() {
+      this.type = Type.NONE;
+      this.nodeOfInterest = null;
+    }
+  }
+
+  ASTNode fixUpAfterCbo(ASTNode originalAst, ASTNode newAst, PreCboCtx cboCtx)
+      throws SemanticException {
+    switch (cboCtx.type) {
+
+    case NONE:
+      // nothing to do
+      return newAst;
+
+    case CTAS:
+    case VIEW: {
+      // Patch the optimized query back into original CTAS AST, replacing the
+      // original query.
+      replaceASTChild(cboCtx.nodeOfInterest, newAst);
+      return originalAst;
+    }
+
+    case INSERT: {
+      // We need to patch the dest back to original into new query.
+      // This makes assumptions about the structure of the AST.
+      ASTNode newDest = new ASTSearcher().simpleBreadthFirstSearch(newAst, HiveParser.TOK_QUERY,
+          HiveParser.TOK_INSERT, HiveParser.TOK_DESTINATION);
+      if (newDest == null) {
+        LOG.error("Cannot find destination after CBO; new ast is " + newAst.dump());
+        throw new SemanticException("Cannot find destination after CBO");
+      }
+      replaceASTChild(newDest, cboCtx.nodeOfInterest);
+      return newAst;
+    }
+
+    case MULTI_INSERT: {
+      // Patch the optimized query back into original FROM clause.
+      replaceASTChild(cboCtx.nodeOfInterest, newAst);
+      return originalAst;
+    }
+
+    default:
+      throw new AssertionError("Unexpected type " + cboCtx.type);
+    }
+  }
+
+  ASTNode reAnalyzeCTASAfterCbo(ASTNode newAst) throws SemanticException {
+    // analyzeCreateTable uses this.ast, but doPhase1 doesn't, so only reset it
+    // here.
+    newAst = analyzeCreateTable(newAst, getQB(), null);
+    if (newAst == null) {
+      LOG.error("analyzeCreateTable failed to initialize CTAS after CBO;" + " new ast is "
+          + getAST().dump());
+      throw new SemanticException("analyzeCreateTable failed to initialize CTAS after CBO");
+    }
+    return newAst;
+  }
+
+  ASTNode reAnalyzeViewAfterCbo(ASTNode newAst) throws SemanticException {
+    // analyzeCreateView uses this.ast, but doPhase1 doesn't, so only reset it
+    // here.
+    newAst = analyzeCreateView(newAst, getQB(), null);
+    if (newAst == null) {
+      LOG.error("analyzeCreateTable failed to initialize materialized view after CBO;" + " new ast is "
+          + getAST().dump());
+      throw new SemanticException("analyzeCreateTable failed to initialize materialized view after CBO");
+    }
+    return newAst;
+  }
+
+
+  public static class ASTSearcher {
+    private final LinkedList<ASTNode> searchQueue = new LinkedList<ASTNode>();
+
+    /**
+     * Performs a breadth-first search of the AST for a nested set of tokens. Tokens
+     * don't have to be each other's direct children; they can be separated by
+     * layers of other tokens. For each token in the list, the first one found is
+     * matched and there is no backtracking; thus, if the AST has multiple instances of
+     * some token, of which only one matches, it is not guaranteed to be found. We
+     * use this for simple things. Not thread-safe - reuses searchQueue.
+     */
+    public ASTNode simpleBreadthFirstSearch(ASTNode ast, int... tokens) {
+      searchQueue.clear();
+      searchQueue.add(ast);
+      for (int i = 0; i < tokens.length; ++i) {
+        boolean found = false;
+        int token = tokens[i];
+        while (!searchQueue.isEmpty() && !found) {
+          ASTNode next = searchQueue.poll();
+          found = next.getType() == token;
+          if (found) {
+            if (i == tokens.length - 1)
+              return next;
+            searchQueue.clear();
+          }
+          for (int j = 0; j < next.getChildCount(); ++j) {
+            searchQueue.add((ASTNode) next.getChild(j));
+          }
+        }
+        if (!found)
+          return null;
+      }
+      return null;
+    }
+
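+    /**
+     * Returns the first node in the tree with the given token type. Note: despite the
+     * name, traversal here reuses the shared queue and proceeds breadth-first.
+     */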
+    public ASTNode depthFirstSearch(ASTNode ast, int token) {
+      searchQueue.clear();
+      searchQueue.add(ast);
+      while (!searchQueue.isEmpty()) {
+        ASTNode next = searchQueue.poll();
+        if (next.getType() == token) return next;
+        for (int j = 0; j < next.getChildCount(); ++j) {
+          searchQueue.add((ASTNode) next.getChild(j));
+        }
+      }
+      return null;
+    }
+
+    public ASTNode simpleBreadthFirstSearchAny(ASTNode ast, int... tokens) {
+      searchQueue.clear();
+      searchQueue.add(ast);
+      while (!searchQueue.isEmpty()) {
+        ASTNode next = searchQueue.poll();
+        for (int i = 0; i < tokens.length; ++i) {
+          if (next.getType() == tokens[i]) return next;
+        }
+        for (int i = 0; i < next.getChildCount(); ++i) {
+          searchQueue.add((ASTNode) next.getChild(i));
+        }
+      }
+      return null;
+    }
+
+    public void reset() {
+      searchQueue.clear();
+    }
+  }
+
+  private static void replaceASTChild(ASTNode child, ASTNode newChild) {
+    ASTNode parent = (ASTNode) child.parent;
+    int childIndex = child.childIndex;
+    parent.deleteChild(childIndex);
+    parent.insertChild(childIndex, newChild);
+  }
+
+  /**
+   * Get optimized logical plan for the given QB tree in the semAnalyzer.
+   *
+   * @return the optimized Calcite logical plan (RelNode)
+   * @throws SemanticException
+   */
+  RelNode logicalPlan() throws SemanticException {
+    RelNode optimizedOptiqPlan = null;
+
+    CalcitePlannerAction calcitePlannerAction = null;
+    if (this.columnAccessInfo == null) {
+      this.columnAccessInfo = new ColumnAccessInfo();
+    }
+    calcitePlannerAction = new CalcitePlannerAction(prunedPartitions, this.columnAccessInfo);
+
+    try {
+      optimizedOptiqPlan = Frameworks.withPlanner(calcitePlannerAction, Frameworks
+          .newConfigBuilder().typeSystem(new HiveTypeSystemImpl()).build());
+    } catch (Exception e) {
+      rethrowCalciteException(e);
+      throw new AssertionError("rethrowCalciteException didn't throw for " + e.getMessage());
+    }
+    return optimizedOptiqPlan;
+  }
+
+  /**
+   * Get Optimized AST for the given QB tree in the semAnalyzer.
+   *
+   * @return Optimized operator tree translated in to Hive AST
+   * @throws SemanticException
+   */
+  ASTNode getOptimizedAST() throws SemanticException {
+    RelNode optimizedOptiqPlan = logicalPlan();
+    ASTNode optiqOptimizedAST = ASTConverter.convert(optimizedOptiqPlan, resultSchema,
+            HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_COLUMN_ALIGNMENT));
+    return optiqOptimizedAST;
+  }
+
+  /**
+   * Get Optimized Hive Operator DAG for the given QB tree in the semAnalyzer.
+   *
+   * @return Optimized Hive operator tree
+   * @throws SemanticException
+   */
+  Operator getOptimizedHiveOPDag() throws SemanticException {
+    RelNode optimizedOptiqPlan = null;
+    CalcitePlannerAction calcitePlannerAction = null;
+    if (this.columnAccessInfo == null) {
+      this.columnAccessInfo = new ColumnAccessInfo();
+    }
+    calcitePlannerAction = new CalcitePlannerAction(prunedPartitions, this.columnAccessInfo);
+
+    try {
+      optimizedOptiqPlan = Frameworks.withPlanner(calcitePlannerAction, Frameworks
+          .newConfigBuilder().typeSystem(new HiveTypeSystemImpl()).build());
+    } catch (Exception e) {
+      rethrowCalciteException(e);
+      throw new AssertionError("rethrowCalciteException didn't throw for " + e.getMessage());
+    }
+
+    RelNode modifiedOptimizedOptiqPlan = PlanModifierForReturnPath.convertOpTree(
+        optimizedOptiqPlan, resultSchema, this.getQB().getTableDesc() != null);
+
+    LOG.debug("Translating the following plan:\n" + RelOptUtil.toString(modifiedOptimizedOptiqPlan));
+    Operator<?> hiveRoot = new HiveOpConverter(this, conf, unparseTranslator, topOps)
+                                  .convert(modifiedOptimizedOptiqPlan);
+    RowResolver hiveRootRR = genRowResolver(hiveRoot, getQB());
+    opParseCtx.put(hiveRoot, new OpParseContext(hiveRootRR));
+    String dest = getQB().getParseInfo().getClauseNames().iterator().next();
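+    // If the destination clause has a target schema and this is not a CTAS, add a SELECT
+    // that maps the CBO output columns onto the destination schema before the FileSink.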
+    if (getQB().getParseInfo().getDestSchemaForClause(dest) != null
+        && this.getQB().getTableDesc() == null) {
+      Operator<?> selOp = handleInsertStatement(dest, hiveRoot, hiveRootRR, getQB());
+      return genFileSinkPlan(dest, getQB(), selOp);
+    } else {
+      return genFileSinkPlan(dest, getQB(), hiveRoot);
+    }
+  }
+
+  // This function serves as a wrapper for handleInsertStatementSpec in
+  // SemanticAnalyzer.
+  Operator<?> handleInsertStatement(String dest, Operator<?> input, RowResolver inputRR, QB qb)
+      throws SemanticException {
+    ArrayList<ExprNodeDesc> colList = new ArrayList<ExprNodeDesc>();
+    ArrayList<ColumnInfo> columns = inputRR.getColumnInfos();
+    for (int i = 0; i < columns.size(); i++) {
+      ColumnInfo col = columns.get(i);
+      colList.add(new ExprNodeColumnDesc(col));
+    }
+    ASTNode selExprList = qb.getParseInfo().getSelForClause(dest);
+
+    RowResolver out_rwsch = handleInsertStatementSpec(colList, dest, inputRR, inputRR, qb,
+        selExprList);
+
+    ArrayList<String> columnNames = new ArrayList<String>();
+    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
+    for (int i = 0; i < colList.size(); i++) {
+      String outputCol = getColumnInternalName(i);
+      colExprMap.put(outputCol, colList.get(i));
+      columnNames.add(outputCol);
+    }
+    Operator<?> output = putOpInsertMap(OperatorFactory.getAndMakeChild(new SelectDesc(colList,
+        columnNames), new RowSchema(out_rwsch.getColumnInfos()), input), out_rwsch);
+    output.setColumnExprMap(colExprMap);
+    return output;
+  }
+
+  /**
+   * Unwraps Calcite invocation exceptions coming from the metadata provider chain and
+   * obtains the real cause.
+   *
+   * @param e the exception to unwrap
+   */
+  private void rethrowCalciteException(Exception e) throws SemanticException {
+    Throwable first = (semanticException != null) ? semanticException : e, current = first, cause = current
+        .getCause();
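+    // Walk the cause chain, dropping "useless" wrapper exceptions (see isUselessCause)
+    // so that the most informative nested cause is the one that gets rethrown.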
+    while (cause != null) {
+      Throwable causeOfCause = cause.getCause();
+      if (current == first && causeOfCause == null && isUselessCause(first)) {
+        // "cause" is a root cause, and "e"/"first" is a useless
+        // exception it's wrapped in.
+        first = cause;
+        break;
+      } else if (causeOfCause != null && isUselessCause(cause)
+          && ExceptionHelper.resetCause(current, causeOfCause)) {
+        // "cause" was a useless intermediate cause and was replace it
+        // with its own cause.
+        cause = causeOfCause;
+        continue; // do loop once again with the new cause of "current"
+      }
+      current = cause;
+      cause = current.getCause();
+    }
+
+    if (first instanceof RuntimeException) {
+      throw (RuntimeException) first;
+    } else if (first instanceof SemanticException) {
+      throw (SemanticException) first;
+    }
+    throw new RuntimeException(first);
+  }
+
+  private static class ExceptionHelper {
+    private static final Field CAUSE_FIELD = getField(Throwable.class, "cause"),
+        TARGET_FIELD = getField(InvocationTargetException.class, "target"),
+        MESSAGE_FIELD = getField(Throwable.class, "detailMessage");
+
+    private static Field getField(Class<?> clazz, String name) {
+      try {
+        Field f = clazz.getDeclaredField(name);
+        f.setAccessible(true);
+        return f;
+      } catch (Throwable t) {
+        return null;
+      }
+    }
+
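+    // Uses reflection to overwrite the cause (or "target" for InvocationTargetException)
+    // and, when the detail message merely echoed the old cause, the message as well;
+    // returns false if the required fields are not accessible.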
+    public static boolean resetCause(Throwable target, Throwable newCause) {
+      try {
+        if (MESSAGE_FIELD == null)
+          return false;
+        Field field = (target instanceof InvocationTargetException) ? TARGET_FIELD : CAUSE_FIELD;
+        if (field == null)
+          return false;
+
+        Throwable oldCause = target.getCause();
+        String oldMsg = target.getMessage();
+        field.set(target, newCause);
+        if (oldMsg != null && oldMsg.equals(oldCause.toString())) {
+          MESSAGE_FIELD.set(target, newCause == null ? null : newCause.toString());
+        }
+      } catch (Throwable se) {
+        return false;
+      }
+      return true;
+    }
+  }
+
+  private boolean isUselessCause(Throwable t) {
+    return t instanceof RuntimeException || t instanceof InvocationTargetException
+        || t instanceof UndeclaredThrowableException;
+  }
+
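+  // Builds a RowResolver from the operator's output schema; when the QB has exactly one
+  // alias and one subquery alias, columns are qualified with that alias.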
+  private RowResolver genRowResolver(Operator op, QB qb) {
+    RowResolver rr = new RowResolver();
+    String subqAlias = (qb.getAliases().size() == 1 && qb.getSubqAliases().size() == 1) ? qb
+        .getAliases().get(0) : null;
+
+    for (ColumnInfo ci : op.getSchema().getSignature()) {
+      try {
+        rr.putWithCheck((subqAlias != null) ? subqAlias : ci.getTabAlias(),
+            ci.getAlias() != null ? ci.getAlias() : ci.getInternalName(), ci.getInternalName(),
+            new ColumnInfo(ci));
+      } catch (SemanticException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    return rr;
+  }
+
+  private enum ExtendedCBOProfile {
+    JOIN_REORDERING,
+    WINDOWING_POSTPROCESSING;
+  }
+
+  /**
+   * Code responsible for Calcite plan generation and optimization.
+   */
+  private class CalcitePlannerAction implements Frameworks.PlannerAction<RelNode> {
+    private RelOptCluster                                 cluster;
+    private RelOptSchema                                  relOptSchema;
+    private final Map<String, PrunedPartitionList>        partitionCache;
+    private final ColumnAccessInfo columnAccessInfo;
+    private Map<HiveProject, Table> viewProjectToTableSchema;
+
+    // Correlated vars across subqueries within the same query need to have different IDs;
+    // this will be used in RexNodeConverter to create the correlated var.
+    private int subqueryId;
+
+    // Keeps track of whether a subquery is correlated and contains an aggregate,
+    // since this is special-cased when it is rewritten in SubqueryRemoveRule.
+    Set<RelNode> corrScalarRexSQWithAgg = new HashSet<RelNode>();
+
+    // TODO: Do we need to keep track of RR, ColNameToPosMap for every op or
+    // just the last one?
+    LinkedHashMap<RelNode, RowResolver>                   relToHiveRR                   = new LinkedHashMap<RelNode, RowResolver>();
+    LinkedHashMap<RelNode, ImmutableMap<String, Integer>> relToHiveColNameCalcitePosMap = new LinkedHashMap<RelNode, ImmutableMap<String, Integer>>();
+
+    CalcitePlannerAction(Map<String, PrunedPartitionList> partitionCache, ColumnAccessInfo columnAccessInfo) {
+      this.partitionCache = partitionCache;
+      this.columnAccessInfo = columnAccessInfo;
+    }
+
+    @Override
+    public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlus rootSchema) {
+      RelNode calciteGenPlan = null;
+      RelNode calcitePreCboPlan = null;
+      RelNode calciteOptimizedPlan = null;
+      subqueryId = 0;
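+
+      // Overall flow: generate the initial Calcite plan from the QB, trim unused fields,
+      // remove subqueries and decorrelate, apply pre-join-ordering transforms, reorder
+      // joins when the profile is enabled, run stats-independent rules, optionally rewrite
+      // against materialized views, transpose aggregates over joins, convert Join + GBy to
+      // semijoin, and post-process windowing over aggregation columns.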
+
+      /*
+       * recreate cluster, so that it picks up the additional traitDef
+       */
+      final Double maxSplitSize = (double) HiveConf.getLongVar(
+              conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE);
+      final Double maxMemory = (double) HiveConf.getLongVar(
+              conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
+      HiveAlgorithmsConf algorithmsConf = new HiveAlgorithmsConf(maxSplitSize, maxMemory);
+      HiveRulesRegistry registry = new HiveRulesRegistry();
+      Properties calciteConfigProperties = new Properties();
+      calciteConfigProperties.setProperty(
+              CalciteConnectionProperty.MATERIALIZATIONS_ENABLED.camelName(),
+              Boolean.FALSE.toString());
+      CalciteConnectionConfig calciteConfig = new CalciteConnectionConfigImpl(calciteConfigProperties);
+      HivePlannerContext confContext = new HivePlannerContext(algorithmsConf, registry, calciteConfig,
+                 corrScalarRexSQWithAgg);
+      RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(confContext);
+      final RexBuilder rexBuilder = cluster.getRexBuilder();
+      final RelOptCluster optCluster = RelOptCluster.create(planner, rexBuilder);
+
+      this.cluster = optCluster;
+      this.relOptSchema = relOptSchema;
+
+      PerfLogger perfLogger = SessionState.getPerfLogger();
+
+      // 1. Gen Calcite Plan
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      try {
+        calciteGenPlan = genLogicalPlan(getQB(), true, null, null);
+        // if it is a create-view statement, we do not qualify column names with table aliases
+        resultSchema = SemanticAnalyzer.convertRowSchemaToResultSetSchema(
+            relToHiveRR.get(calciteGenPlan),
+            getQB().isView() ? false : HiveConf.getBoolVar(conf,
+                HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
+      } catch (SemanticException e) {
+        semanticException = e;
+        throw new RuntimeException(e);
+      }
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Plan generation");
+
+      // Create executor
+      RexExecutor executorProvider = new HiveRexExecutorImpl(optCluster);
+      calciteGenPlan.getCluster().getPlanner().setExecutor(executorProvider);
+
+      // We need to get the ColumnAccessInfo and viewToTableSchema for views.
+      HiveRelFieldTrimmer fieldTrimmer = new HiveRelFieldTrimmer(null,
+          HiveRelFactories.HIVE_BUILDER.create(optCluster, null), this.columnAccessInfo,
+          this.viewProjectToTableSchema);
+
+      fieldTrimmer.trim(calciteGenPlan);
+
+      // Create and set MD provider
+      HiveDefaultRelMetadataProvider mdProvider = new HiveDefaultRelMetadataProvider(conf);
+      RelMetadataQuery.THREAD_PROVIDERS.set(
+              JaninoRelMetadataProvider.of(mdProvider.getMetadataProvider()));
+
+      //Remove subquery
+      LOG.debug("Plan before removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+      calciteGenPlan = hepPlan(calciteGenPlan, false, mdProvider.getMetadataProvider(), null,
+              HiveSubQueryRemoveRule.REL_NODE);
+      LOG.debug("Plan just after removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+
+      calciteGenPlan = HiveRelDecorrelator.decorrelateQuery(calciteGenPlan);
+      LOG.debug("Plan after decorrelation:\n" + RelOptUtil.toString(calciteGenPlan));
+
+      // 2. Apply pre-join order optimizations
+      calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan,
+              mdProvider.getMetadataProvider(), executorProvider);
+
+      // 3. Apply join order optimizations: reordering MST algorithm
+      //    If join optimizations fail because of missing stats, we continue with
+      //    the rest of the optimizations
+      if (profilesCBO.contains(ExtendedCBOProfile.JOIN_REORDERING)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        try {
+          List<RelMetadataProvider> list = Lists.newArrayList();
+          list.add(mdProvider.getMetadataProvider());
+          RelTraitSet desiredTraits = optCluster
+              .traitSetOf(HiveRelNode.CONVENTION, RelCollations.EMPTY);
+
+          HepProgramBuilder hepPgmBldr = new HepProgramBuilder().addMatchOrder(HepMatchOrder.BOTTOM_UP);
+          hepPgmBldr.addRuleInstance(new JoinToMultiJoinRule(HiveJoin.class));
+          hepPgmBldr.addRuleInstance(new LoptOptimizeJoinRule(HiveRelFactories.HIVE_BUILDER));
+
+          HepProgram hepPgm = hepPgmBldr.build();
+          HepPlanner hepPlanner = new HepPlanner(hepPgm);
+
+          hepPlanner.registerMetadataProviders(list);
+          RelMetadataProvider chainedProvider = ChainedRelMetadataProvider.of(list);
+          optCluster.setMetadataProvider(new CachingRelMetadataProvider(chainedProvider, hepPlanner));
+
+          RelNode rootRel = calcitePreCboPlan;
+          hepPlanner.setRoot(rootRel);
+          if (!calcitePreCboPlan.getTraitSet().equals(desiredTraits)) {
+            rootRel = hepPlanner.changeTraits(calcitePreCboPlan, desiredTraits);
+          }
+          hepPlanner.setRoot(rootRel);
+
+          calciteOptimizedPlan = hepPlanner.findBestExp();
+        } catch (Exception e) {
+          boolean isMissingStats = noColsMissingStats.get() > 0;
+          if (isMissingStats) {
+            LOG.warn("Missing column stats (see previous messages), skipping join reordering in CBO");
+            noColsMissingStats.set(0);
+            calciteOptimizedPlan = calcitePreCboPlan;
+            disableSemJoinReordering = false;
+          } else {
+            throw e;
+          }
+        }
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Join Reordering");
+      } else {
+        calciteOptimizedPlan = calcitePreCboPlan;
+        disableSemJoinReordering = false;
+      }
+
+      // 4. Run other optimizations that do not need stats
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+              HepMatchOrder.BOTTOM_UP, ProjectRemoveRule.INSTANCE, UnionMergeRule.INSTANCE,
+              HiveProjectMergeRule.INSTANCE_NO_FORCE, HiveAggregateProjectMergeRule.INSTANCE,
+              HiveJoinCommuteRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Optimizations without stats");
+
+      // 5. Materialized view based rewriting
+      // We disable it for CTAS and MV creation queries (trying to avoid any problem
+      // due to data freshness)
+      if (conf.getBoolVar(ConfVars.HIVE_MATERIALIZED_VIEW_ENABLE_AUTO_REWRITING) &&
+              !getQB().isMaterializedView() && !getQB().isCTAS()) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        // Use Calcite cost model for view rewriting
+        RelMetadataProvider calciteMdProvider = DefaultRelMetadataProvider.INSTANCE;
+        RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.of(calciteMdProvider));
+        planner.registerMetadataProviders(Lists.newArrayList(calciteMdProvider));
+        // Add views to planner
+        List<RelOptMaterialization> materializations = new ArrayList<>();
+        try {
+          materializations = Hive.get().getRewritingMaterializedViews();
+          // We need to use the current cluster for the scan operator on views,
+          // otherwise the planner will throw an Exception (different planners)
+          materializations = Lists.transform(materializations,
+              new Function<RelOptMaterialization, RelOptMaterialization>() {
+                @Override
+                public RelOptMaterialization apply(RelOptMaterialization materialization) {
+                  final RelNode viewScan = materialization.tableRel;
+                  final RelNode newViewScan;
+                  if (viewScan instanceof DruidQuery) {
+                    final DruidQuery dq = (DruidQuery) viewScan;
+                    newViewScan = DruidQuery.create(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION),
+                        viewScan.getTable(), dq.getDruidTable(),
+                        ImmutableList.<RelNode>of(dq.getTableScan()));
+                  } else {
+                    newViewScan = new HiveTableScan(optCluster, optCluster.traitSetOf(HiveRelNode.CONVENTION),
+                        (RelOptHiveTable) viewScan.getTable(), viewScan.getTable().getQualifiedName().get(0),
+                        null, false, false);
+                  }
+                  return new RelOptMaterialization(newViewScan, materialization.queryRel, null);
+                }
+              }
+          );
+        } catch (HiveException e) {
+          LOG.warn("Exception loading materialized views", e);
+        }
+        if (!materializations.isEmpty()) {
+          for (RelOptMaterialization materialization : materializations) {
+            planner.addMaterialization(materialization);
+          }
+          // Add view-based rewriting rules to planner
+          planner.addRule(HiveMaterializedViewFilterScanRule.INSTANCE);
+          // Optimize plan
+          planner.setRoot(calciteOptimizedPlan);
+          calciteOptimizedPlan = planner.findBestExp();
+          // Remove view-based rewriting rules from planner
+          planner.clear();
+        }
+        // Restore default cost model
+        RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.of(mdProvider.getMetadataProvider()));
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: View-based rewriting");
+      }
+
+      // 6. Run aggregate-join transpose (cost-based)
+      //    If it fails because of missing stats, we continue with
+      //    the rest of the optimizations
+      if (conf.getBoolVar(ConfVars.AGGR_JOIN_TRANSPOSE)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        try {
+          calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+                  HepMatchOrder.BOTTOM_UP, HiveAggregateJoinTransposeRule.INSTANCE);
+        } catch (Exception e) {
+          boolean isMissingStats = noColsMissingStats.get() > 0;
+          if (isMissingStats) {
+            LOG.warn("Missing column stats (see previous messages), skipping aggregate-join transpose in CBO");
+            noColsMissingStats.set(0);
+          } else {
+            throw e;
+          }
+        }
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Aggregate join transpose");
+      }
+
+      // 7. Convert Join + GBy to semijoin
+      // Run this rule at later stages, since many Calcite rules can't deal with semijoins
+      if (conf.getBoolVar(ConfVars.SEMIJOIN_CONVERSION)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null, HiveSemiJoinRule.INSTANCE);
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Semijoin conversion");
+      }
+
+
+      // 8. Run rule to fix windowing issue when it is done over
+      // aggregation columns (HIVE-10627)
+      if (profilesCBO.contains(ExtendedCBOProfile.WINDOWING_POSTPROCESSING)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+                HepMatchOrder.BOTTOM_UP, HiveWindowingFixRule.INSTANCE);
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Window fixing rule");
+      }
+
+      // 9. Apply Druid transformation rules
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+              HepMatchOrder.BOTTOM_UP, DruidRules.FILTER, DruidRules.AGGREGATE_PROJECT,
+              DruidRules.PROJECT, DruidRules.AGGREGATE, DruidRules.SORT_PROJECT_TRANSPOSE,
+              DruidRules.SORT, DruidRules.PROJECT_SORT_TRANSPOSE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Druid transformation rules");
+
+      // 10. Run rules to aid in translation from Calcite tree to Hive tree
+      if (HiveConf.getBoolVar(conf, ConfVars.HIVE_CBO_RETPATH_HIVEOP)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        // 10.1. Merge join into multijoin operators (if possible)
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, true, mdProvider.getMetadataProvider(), null,
+                HepMatchOrder.BOTTOM_UP, HiveJoinProjectTransposeRule.BOTH_PROJECT_INCLUDE_OUTER,
+                HiveJoinProjectTransposeRule.LEFT_PROJECT_INCLUDE_OUTER,
+                HiveJoinProjectTransposeRule.RIGHT_PROJECT_INCLUDE_OUTER,
+                HiveJoinToMultiJoinRule.INSTANCE, HiveProjectMergeRule.INSTANCE);
+        // The previous rules can pull up projections through join operators,
+        // thus we run the field trimmer again to push them back down
+        fieldTrimmer = new HiveRelFieldTrimmer(null,
+            HiveRelFactories.HIVE_BUILDER.create(optCluster, null));
+        calciteOptimizedPlan = fieldTrimmer.trim(calciteOptimizedPlan);
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+                HepMatchOrder.BOTTOM_UP, ProjectRemoveRule.INSTANCE,
+                new ProjectMergeRule(false, HiveRelFactories.HIVE_BUILDER));
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, true, mdProvider.getMetadataProvider(), null,
+                HiveFilterProjectTSTransposeRule.INSTANCE, HiveFilterProjectTSTransposeRule.INSTANCE_DRUID,
+                HiveProjectFilterPullUpConstantsRule.INSTANCE);
+
+        // 10.2.  Introduce exchange operators below join/multijoin operators
+        calciteOptimizedPlan = hepPlan(calciteOptimizedPlan, false, mdProvider.getMetadataProvider(), null,
+                HepMatchOrder.BOTTOM_UP, HiveInsertExchange4JoinRule.EXCHANGE_BELOW_JOIN,
+                HiveInsertExchange4JoinRule.EXCHANGE_BELOW_MULTIJOIN);
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER, "Calcite: Translation from Calcite tree to Hive tree");
+      }
+
+      if (LOG.isDebugEnabled() && !conf.getBoolVar(ConfVars.HIVE_IN_TEST)) {
+        LOG.debug("CBO Planning details:\n");
+        LOG.debug("Original Plan:\n" + RelOptUtil.toString(calciteGenPlan));
+        LOG.debug("Plan After PPD, PartPruning, ColumnPruning:\n"
+            + RelOptUtil.toString(calcitePreCboPlan));
+        LOG.debug("Plan After Join Reordering:\n"
+            + RelOptUtil.toString(calciteOptimizedPlan, SqlExplainLevel.ALL_ATTRIBUTES));
+      }
+
+      return calciteOptimizedPlan;
+    }
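
For readers less familiar with Calcite's heuristic planner, every numbered phase above follows the same pattern: build a HepProgram from a batch of rules, point a HepPlanner at the current plan, and take the rewritten plan as the new one. A minimal, self-contained sketch of that pattern is shown below; it is illustrative only (not part of this patch) and simplified relative to the hepPlan helper defined later in this class.

    import org.apache.calcite.plan.hep.HepMatchOrder;
    import org.apache.calcite.plan.hep.HepPlanner;
    import org.apache.calcite.plan.hep.HepProgramBuilder;
    import org.apache.calcite.rel.RelNode;
    import org.apache.calcite.rel.rules.ProjectRemoveRule;

    /** Minimal HEP pass: remove trivial projects, matching bottom-up. */
    static RelNode removeTrivialProjects(RelNode inputPlan) {
      // Build a HEP program with a match order and a single rule instance.
      HepProgramBuilder builder = new HepProgramBuilder();
      builder.addMatchOrder(HepMatchOrder.BOTTOM_UP);
      builder.addRuleInstance(ProjectRemoveRule.INSTANCE);

      // The planner rewrites the plan rooted at inputPlan until no rule fires.
      HepPlanner hepPlanner = new HepPlanner(builder.build());
      hepPlanner.setRoot(inputPlan);
      return hepPlanner.findBestExp();
    }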
+
+    /**
+     * Perform all optimizations before Join Ordering.
+     *
+     * @param basePlan
+     *          original plan
+     * @param mdProvider
+     *          metadata provider
+     * @param executorProvider
+     *          rex executor
+     * @return the transformed plan
+     */
+    private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProvider mdProvider, RexExecutor executorProvider) {
+      // TODO: Decorrelation of subqueries should be done before attempting
+      // Partition Pruning; otherwise expression evaluation may try to execute a
+      // correlated subquery.
+
+      PerfLogger perfLogger = SessionState.getPerfLogger();
+
+      final int maxCNFNodeCount = conf.getIntVar(HiveConf.ConfVars.HIVE_CBO_CNF_NODES_LIMIT);
+      final int minNumORClauses = conf.getIntVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZERMIN);
+
+      // 0. SetOp rewrite
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, true, mdProvider, null, HepMatchOrder.BOTTOM_UP,
+          HiveProjectOverIntersectRemoveRule.INSTANCE, HiveIntersectMergeRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+          "Calcite: HiveProjectOverIntersectRemoveRule and HiveIntersectMerge rules");
+
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+          HiveIntersectRewriteRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+          "Calcite: HiveIntersectRewrite rule");
+
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+          HiveExceptRewriteRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+          "Calcite: HiveExceptRewrite rule");
+
+      // 1. Distinct aggregate rewrite
+      // Run this optimization early, since it expands the operator pipeline.
+      if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") &&
+          conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEDISTINCTREWRITE)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        // It's not clear if this rewrite is always performant on MR, since the extra map phase
+        // introduced for the 2nd MR job may offset the gains of this multi-stage aggregation.
+        // We need a cost model for MR to enable this on MR.
+        basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HiveExpandDistinctAggregatesRule.INSTANCE);
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+         "Calcite: Prejoin ordering transformation, Distinct aggregate rewrite");
+      }
+
+      // 2. Try factoring out common filter elements & separating deterministic
+      // vs non-deterministic UDFs. This needs to run before PPD so that PPD can
+      // add ON clauses for old-style join syntax
+      // Ex: select * from R1 join R2 where (((R1.x=R2.x) and R1.y<10) or
+      // ((R1.x=R2.x) and R1.z=10)) and rand(1) < 0.1
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, HepMatchOrder.ARBITRARY,
+          new HivePreFilteringRule(maxCNFNodeCount));
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, factor out common filter elements and separating deterministic vs non-deterministic UDF");
+
+      // 3. Run exhaustive PPD, add not null filters, transitive inference,
+      // constant propagation, constant folding
+      List<RelOptRule> rules = Lists.newArrayList();
+      if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTPPD_WINDOWING)) {
+        rules.add(HiveFilterProjectTransposeRule.INSTANCE_DETERMINISTIC_WINDOWING);
+      } else {
+        rules.add(HiveFilterProjectTransposeRule.INSTANCE_DETERMINISTIC);
+      }
+      rules.add(HiveFilterSetOpTransposeRule.INSTANCE);
+      rules.add(HiveFilterSortTransposeRule.INSTANCE);
+      rules.add(HiveFilterJoinRule.JOIN);
+      rules.add(HiveFilterJoinRule.FILTER_ON_JOIN);
+      rules.add(new HiveFilterAggregateTransposeRule(Filter.class, HiveRelFactories.HIVE_FILTER_FACTORY, Aggregate.class));
+      rules.add(new FilterMergeRule(HiveRelFactories.HIVE_BUILDER));
+      if (conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_REDUCE_WITH_STATS)) {
+        rules.add(HiveReduceExpressionsWithStatsRule.INSTANCE);
+      }
+      rules.add(HiveProjectFilterPullUpConstantsRule.INSTANCE);
+      rules.add(HiveReduceExpressionsRule.PROJECT_INSTANCE);
+      rules.add(HiveReduceExpressionsRule.FILTER_INSTANCE);
+      rules.add(HiveReduceExpressionsRule.JOIN_INSTANCE);
+      if (conf.getBoolVar(HiveConf.ConfVars.HIVEPOINTLOOKUPOPTIMIZER)) {
+        rules.add(new HivePointLookupOptimizerRule.FilterCondition(minNumORClauses));
+        rules.add(new HivePointLookupOptimizerRule.JoinCondition(minNumORClauses));
+      }
+      rules.add(HiveJoinAddNotNullRule.INSTANCE_JOIN);
+      rules.add(HiveJoinAddNotNullRule.INSTANCE_SEMIJOIN);
+      rules.add(HiveJoinPushTransitivePredicatesRule.INSTANCE_JOIN);
+      rules.add(HiveJoinPushTransitivePredicatesRule.INSTANCE_SEMIJOIN);
+      rules.add(HiveSortMergeRule.INSTANCE);
+      rules.add(HiveSortLimitPullUpConstantsRule.INSTANCE);
+      rules.add(HiveUnionPullUpConstantsRule.INSTANCE);
+      rules.add(HiveAggregatePullUpConstantsRule.INSTANCE);
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+              rules.toArray(new RelOptRule[rules.size()]));
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, PPD, not null predicates, transitive inference, constant folding");
+
+      // 4. Push down limit through outer join
+      // NOTE: We run this after PPD to support old-style join syntax.
+      // Ex: select * from R1 left outer join R2 where (((R1.x=R2.x) and R1.y<10) or
+      // ((R1.x=R2.x) and R1.z=10)) and rand(1) < 0.1 order by R1.x limit 10
+      if (conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE)) {
+        perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+        // This should be a cost-based decision, but until we enable the extended cost
+        // model, we will use the configured values for these variables
+        final float reductionProportion = HiveConf.getFloatVar(conf,
+            HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE_REDUCTION_PERCENTAGE);
+        final long reductionTuples = HiveConf.getLongVar(conf,
+            HiveConf.ConfVars.HIVE_OPTIMIZE_LIMIT_TRANSPOSE_REDUCTION_TUPLES);
+        basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HiveSortMergeRule.INSTANCE,
+            HiveSortProjectTransposeRule.INSTANCE, HiveSortJoinReduceRule.INSTANCE,
+            HiveSortUnionReduceRule.INSTANCE);
+        basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, HepMatchOrder.BOTTOM_UP,
+            new HiveSortRemoveRule(reductionProportion, reductionTuples),
+            HiveProjectSortTransposeRule.INSTANCE);
+        perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+          "Calcite: Prejoin ordering transformation, Push down limit through outer join");
+      }
+
+      // 5. Push Down Semi Joins
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, true, mdProvider, executorProvider, SemiJoinJoinTransposeRule.INSTANCE,
+          SemiJoinFilterTransposeRule.INSTANCE, SemiJoinProjectTransposeRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, Push Down Semi Joins");
+
+      // 6. Apply Partition Pruning
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, false, mdProvider, executorProvider, new HivePartitionPruneRule(conf));
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, Partition Pruning");
+
+      // 7. Projection Pruning (this introduces a Select above TS and hence needs to run after Partition Pruning)
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      HiveRelFieldTrimmer fieldTrimmer = new HiveRelFieldTrimmer(null,
+          HiveRelFactories.HIVE_BUILDER.create(cluster, null),
+          profilesCBO.contains(ExtendedCBOProfile.JOIN_REORDERING));
+      basePlan = fieldTrimmer.trim(basePlan);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, Projection Pruning");
+
+      // 8. Merge, remove and reduce Project if possible
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, false, mdProvider, executorProvider,
+           HiveProjectMergeRule.INSTANCE, ProjectRemoveRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, Merge Project-Project");
+
+      // 9. Rerun PPD through Project, as column pruning would have introduced
+      // a DT above scans; by pushing the filter just above the TS, Hive can push it into
+      // storage (in case there are filters on non-partition cols). This only
+      // matches FIL-PROJ-TS
+      perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
+      basePlan = hepPlan(basePlan, true, mdProvider, executorProvider,
+          HiveFilterProjectTSTransposeRule.INSTANCE, HiveFilterProjectTSTransposeRule.INSTANCE_DRUID,
+          HiveProjectFilterPullUpConstantsRule.INSTANCE);
+      perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.OPTIMIZER,
+        "Calcite: Prejoin ordering transformation, Rerun PPD");
+
+      return basePlan;
+    }
+
+    /**
+     * Run the HEP planner with the given rule set, using the default TOP_DOWN match order.
+     *
+     * @param basePlan plan to transform
+     * @param followPlanChanges if true, apply the rules as a single rule collection
+     * @param mdProvider metadata provider
+     * @param executorProvider rex executor
+     * @param rules rules to apply
+     * @return optimized RelNode
+     */
+    private RelNode hepPlan(RelNode basePlan, boolean followPlanChanges,
+        RelMetadataProvider mdProvider, RexExecutor executorProvider, RelOptRule... rules) {
+      return hepPlan(basePlan, followPlanChanges, mdProvider, executorProvider,
+              HepMatchOrder.TOP_DOWN, rules);
+    }
+
+    /**
+     * Run the HEP planner with the given rule set and match order.
+     *
+     * @param basePlan plan to transform
+     * @param followPlanChanges if true, apply the rules as a single rule collection;
+     *          otherwise register each rule as its own program instruction
+     * @param mdProvider metadata provider
+     * @param executorProvider rex executor
+     * @param order match order for the HEP program
+     * @param rules rules to apply
+     * @return optimized RelNode
+     */
+    private RelNode hepPlan(RelNode basePlan, boolean followPlanChanges,
+        RelMetadataProvider mdProvider, RexExecutor executorProvider, HepMatchOrder order,
+        RelOptRule... rules) {
+
+      RelNode optimizedRelNode = basePlan;
+      HepProgramBuilder programBuilder = new HepProgramBuilder();
+      if (followPlanChanges) {
+        programBuilder.addMatchOrder(order);
+        programBuilder = programBuilder.addRuleCollection(ImmutableList.copyOf(rules));
+      } else {
+        // TODO: Should this also be TOP_DOWN?
+        for (RelOptRule r : rules) {
+          programBuilder.addRuleInstance(r);
+        }
+      }
+
+      // Create planner and copy context
+      HepPlanner planner = new HepPlanner(programBuilder.build(),
+              basePlan.getCluster().getPlanner().getContext());
+
+      List<RelMetadataProvider> list = Lists.newArrayList();
+      list.add(mdProvider);
+      planner.registerMetadataProviders(list);
+      RelMetadataProvider chainedProvider = ChainedRelMetadataProvider.of(list);
+      basePlan.getCluster().setMetadataProvider(
+          new CachingRelMetadataProvider(chainedProvider, planner));
+
+      if (executorProvider != null) {
+        // basePlan.getCluster().getPlanner() is the VolcanoPlanner from apply();
+        // both planners need to use the correct executor
+        basePlan.getCluster().getPlanner().setExecutor(executorProvider);
+        planner.setExecutor(executorProvider);
+      }
+
+      planner.setRoot(basePlan);
+      optimizedRelNode = planner.findBestExp();
+
+      return optimizedRelNode;
+    }
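
As a usage illustration (mirroring the existing call sites in applyPreJoinOrderingTransforms, with the surrounding variables assumed to be in scope as in this class):

    // Sketch only: one bottom-up HEP pass applying two rules as a single
    // rule collection (followPlanChanges = true).
    basePlan = hepPlan(basePlan, true, mdProvider, executorProvider,
        HepMatchOrder.BOTTOM_UP,
        HiveProjectMergeRule.INSTANCE, ProjectRemoveRule.INSTANCE);

Note that the order argument only takes effect in the rule-collection branch; with followPlanChanges set to false, each rule becomes its own program instruction, which is what the TODO above is asking about.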
+
+    @SuppressWarnings("nls")
+    private RelNode genSetOpLogicalPlan(Opcode opcode, String alias, String leftalias, RelNode leftRel,
+        String rightalias, RelNode rightRel) throws SemanticException {
+      // 1. Get Row Resolvers, Column map for original left and right input of
+      // SetOp Rel
+      RowResolver leftRR = this.relToHiveRR.get(leftRel);
+      RowResolver rightRR = this.relToHiveRR.get(rightRel);
+      HashMap<String, ColumnInfo> leftmap = leftRR.getFieldMap(leftalias);
+      HashMap<String, ColumnInfo> rightmap = rightRR.getFieldMap(rightalias);
+
+      // 2. Validate that SetOp is feasible according to Hive (by using type
+      // info from RR)
+      if (leftmap.size() != rightmap.size()) {
+        throw new SemanticException("Schema of both sides of union should match.");
+      }
+
+      ASTNode tabref = getQB().getAliases().isEmpty() ? null : getQB().getParseInfo()
+          .getSrcForAlias(getQB().getAliases().get(0));
+
+      // 3. construct SetOp Output RR using original left & right Input
+      RowResolver setOpOutRR = new RowResolver();
+
+      Iterator<Map.Entry<String, ColumnInfo>> lIter = leftmap.entrySet().iterator();
+      Iterator<Map.Entry<String, ColumnInfo>> rIter = rightmap.entrySet().iterator();
+      while (lIter.hasNext()) {
+        Map.Entry<String, ColumnInfo> lEntry = lIter.next();
+        Map.Entry<String, ColumnInfo> rEntry = rIter.next();
+        ColumnInfo lInfo = lEntry.getValue();
+        ColumnInfo rInfo = rEntry.getValue();
+
+        String field = lEntry.getKey();
+        // try widening conversion, otherwise fail union
+        TypeInfo commonTypeInfo = FunctionRegistry.getCommonClassForUnionAll(lInfo.getType(),
+            rInfo.getType());
+        if (commonTypeInfo == null) {
+          throw new SemanticException(generateErrorMessage(tabref,
+              "Schema of both sides of setop should match: Column " + field
+                  + " is of type " + lInfo.getType().getTypeName()
+                  + " on first table and type " + rInfo.getType().getTypeName()
+                  + " on second table"));
+        }
+        ColumnInfo setOpColInfo = new ColumnInfo(lInfo);
+        setOpColInfo.setType(commonTypeInfo);
+        setOpOutRR.put(alias, field, setOpColInfo);
+      }
+
+      // 4. Determine which columns require a cast on the left/right input (Calcite
+      // requires exact types on both sides of a SetOp)
+      boolean leftNeedsTypeCast = false;
+      boolean rightNeedsTypeCast = false;
+      List<RexNode> leftProjs = new ArrayList<RexNode>();
+      List<RexNode> rightProjs = new ArrayList<RexNode>();
+      List<RelDataTypeField> leftRowDT = leftRel.getRowType().getFieldList();
+      List<RelDataTypeField> rightRowDT = rightRel.getRowType().getFieldList();
+
+      RelDataType leftFieldDT;
+      RelDataType rightFieldDT;
+      RelDataType unionFieldDT;
+      for (int i = 0; i < leftRowDT.size(); i++) {
+        leftFieldDT = leftRowDT.get(i).getType();
+        rightFieldDT = rightRowDT.get(i).getType();
+        if (!leftFieldDT.equals(rightFieldDT)) {
+          unionFieldDT = TypeConverter.convert(setOpOutRR.getColumnInfos().get(i).getType(),
+              cluster.getTypeFactory());
+          if (!unionFieldDT.equals(leftFieldDT)) {
+            leftNeedsTypeCast = true;
+          }
+          leftProjs.add(cluster.getRexBuilder().ensureType(unionFieldDT,
+              cluster.getRexBuilder().makeInputRef(leftFieldDT, i), true));
+
+          if (!unionFieldDT.equals(rightFieldDT)) {
+            rightNeedsTypeCast = true;
+          }
+          rightProjs.add(cluster.getRexBuilder().ensureType(unionFieldDT,
+              cluster.getRexBuilder().makeInputRef(rightFieldDT, i), true));
+        } else {
+          leftProjs.add(cluster.getRexBuilder().ensureType(leftFieldDT,
+              cluster.getRexBuilder().makeInputRef(leftFieldDT, i), true));
+          rightProjs.add(cluster.getRexBuilder().ensureType(rightFieldDT,
+              cluster.getRexBuilder().makeInputRef(rightFieldDT, i), true));
+        }
+      }
+
+      // 5. Introduce Project Rel above original left/right inputs if cast is
+      // needed for type parity
+      RelNode setOpLeftInput = leftRel;
+      RelNode setOpRightInput = rightRel;
+      if (leftNeedsTypeCast) {
+        setOpLeftInput = HiveProject.create(leftRel, leftProjs, leftRel.getRowType()
+            .getFieldNames());
+      }
+      if (rightNeedsTypeCast) {
+        setOpRightInput = HiveProject.create(rightRel, rightProjs, rightRel.getRowType()
+            .getFieldNames());
+      }
+
+      // 6. Construct SetOp Rel
+      Builder<RelNode> bldr = new ImmutableList.Builder<RelNode>();
+      bldr.add(setOpLeftInput);
+      bldr.add(setOpRightInput);
+      SetOp setOpRel = null;
+      switch (opcode) {
+      case UNION:
+        setOpRel = new HiveUnion(cluster, TraitsUtil.getDefaultTraitSet(cluster), bldr.build());
+        break;
+      case INTERSECT:
+        setOpRel = new HiveIntersect(cluster, TraitsUtil.getDefaultTraitSet(cluster), bldr.build(),
+            false);
+ 

<TRUNCATED>

[2/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
new file mode 100644
index 0000000..b5a5645
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig
@@ -0,0 +1,13508 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.security.AccessControlException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Queue;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.UUID;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.antlr.runtime.ClassicToken;
+import org.antlr.runtime.CommonToken;
+import org.antlr.runtime.Token;
+import org.antlr.runtime.tree.Tree;
+import org.antlr.runtime.tree.TreeVisitor;
+import org.antlr.runtime.tree.TreeVisitorAction;
+import org.antlr.runtime.tree.TreeWizard;
+import org.antlr.runtime.tree.TreeWizard.ContextVisitor;
+import org.apache.calcite.rel.RelNode;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.common.ObjectPair;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.common.StatsSetupConst.StatDB;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
+import org.apache.hadoop.hive.metastore.MetaStoreUtils;
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.hadoop.hive.metastore.api.Database;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Order;
+import org.apache.hadoop.hive.metastore.api.SQLForeignKey;
+import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey;
+import org.apache.hadoop.hive.ql.CompilationOpContext;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.QueryProperties;
+import org.apache.hadoop.hive.ql.QueryState;
+import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.ArchiveUtils;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
+import org.apache.hadoop.hive.ql.exec.FetchTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.FunctionInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.RecordReader;
+import org.apache.hadoop.hive.ql.exec.RecordWriter;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.exec.UnionOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.hooks.ReadEntity;
+import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.io.AcidUtils.Operation;
+import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
+import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
+import org.apache.hadoop.hive.ql.io.NullRowsInputFormat;
+import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
+import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.metadata.DummyPartition;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.ql.optimizer.Optimizer;
+import org.apache.hadoop.hive.ql.optimizer.Transform;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverterPostProc;
+import org.apache.hadoop.hive.ql.optimizer.lineage.Generator;
+import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec.SpecType;
+import org.apache.hadoop.hive.ql.parse.CalcitePlanner.ASTSearcher;
+import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFInputSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputType;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionedTableFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitioningSpec;
+import org.apache.hadoop.hive.ql.parse.QBSubQuery.SubQueryType;
+import org.apache.hadoop.hive.ql.parse.SubQueryUtils.ISubQueryJoinInfo;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.Direction;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFrameSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec;
+import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.AlterTableDesc;
+import org.apache.hadoop.hive.ql.plan.AlterTableDesc.AlterTableTypes;
+import org.apache.hadoop.hive.ql.plan.CreateTableDesc;
+import org.apache.hadoop.hive.ql.plan.CreateTableLikeDesc;
+import org.apache.hadoop.hive.ql.plan.CreateViewDesc;
+import org.apache.hadoop.hive.ql.plan.DDLWork;
+import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
+import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc;
+import org.apache.hadoop.hive.ql.plan.ForwardDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.HiveOperation;
+import org.apache.hadoop.hive.ql.plan.InsertTableDesc;
+import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
+import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc;
+import org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc;
+import org.apache.hadoop.hive.ql.plan.LimitDesc;
+import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
+import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
+import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PTFDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.ScriptDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.UDTFDesc;
+import org.apache.hadoop.hive.ql.plan.UnionDesc;
+import org.apache.hadoop.hive.ql.plan.ptf.OrderExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
+import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
+import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hadoop.hive.ql.session.SessionState.ResourceType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.ql.util.ResourceDownloader;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.Deserializer;
+import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
+import org.apache.hadoop.hive.serde2.NoOpFetchFormatter;
+import org.apache.hadoop.hive.serde2.NullStructSerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
+import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.hive.shims.HadoopShims;
+import org.apache.hadoop.hive.shims.Utils;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import com.google.common.base.Splitter;
+import com.google.common.base.Strings;
+import com.google.common.collect.Sets;
+import com.google.common.math.IntMath;
+
+/**
+ * Implementation of the semantic analyzer. It generates the query plan.
+ * There are other specific semantic analyzers for some Hive operations, such as
+ * DDLSemanticAnalyzer for DDL operations.
+ */
+
+public class SemanticAnalyzer extends BaseSemanticAnalyzer {
+
+  public static final String DUMMY_DATABASE = "_dummy_database";
+  public static final String DUMMY_TABLE = "_dummy_table";
+  public static final String SUBQUERY_TAG_1 = "-subquery1";
+  public static final String SUBQUERY_TAG_2 = "-subquery2";
+
+  // Max characters when auto generating the column name with func name
+  private static final int AUTOGEN_COLALIAS_PRFX_MAXLENGTH = 20;
+
+  public static final String VALUES_TMP_TABLE_NAME_PREFIX = "Values__Tmp__Table__";
+
+  static final String MATERIALIZATION_MARKER = "$MATERIALIZATION";
+
+  private HashMap<TableScanOperator, ExprNodeDesc> opToPartPruner;
+  private HashMap<TableScanOperator, PrunedPartitionList> opToPartList;
+  protected HashMap<String, TableScanOperator> topOps;
+  protected LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtx;
+  private List<LoadTableDesc> loadTableWork;
+  private List<LoadFileDesc> loadFileWork;
+  private List<ColumnStatsAutoGatherContext> columnStatsAutoGatherContexts;
+  private final Map<JoinOperator, QBJoinTree> joinContext;
+  private final Map<SMBMapJoinOperator, QBJoinTree> smbMapJoinContext;
+  private final HashMap<TableScanOperator, Table> topToTable;
+  private final Map<FileSinkOperator, Table> fsopToTable;
+  private final List<ReduceSinkOperator> reduceSinkOperatorsAddedByEnforceBucketingSorting;
+  private final HashMap<TableScanOperator, Map<String, String>> topToTableProps;
+  private QB qb;
+  private ASTNode ast;
+  private int destTableId;
+  private UnionProcContext uCtx;
+  List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer;
+  private HashMap<TableScanOperator, SampleDesc> opToSamplePruner;
+  private final Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPartToSkewedPruner;
+  private Map<SelectOperator, Table> viewProjectToTableSchema;
+  /**
+   * a map for the split sampling, from alias to an instance of SplitSample
+   * that describes percentage and number.
+   */
+  private final HashMap<String, SplitSample> nameToSplitSample;
+  Map<GroupByOperator, Set<String>> groupOpToInputTables;
+  Map<String, PrunedPartitionList> prunedPartitions;
+  protected List<FieldSchema> resultSchema;
+  protected CreateViewDesc createVwDesc;
+  protected ArrayList<String> viewsExpanded;
+  protected ASTNode viewSelect;
+  protected final UnparseTranslator unparseTranslator;
+  private final GlobalLimitCtx globalLimitCtx;
+
+  // prefix for column names auto generated by hive
+  private final String autogenColAliasPrfxLbl;
+  private final boolean autogenColAliasPrfxIncludeFuncName;
+
+  // Keep track of the view alias to the read entity corresponding to the view.
+  // E.g., for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T,
+  // this keeps track of the aliases V3, V3:V2, and V3:V2:V1.
+  // When T is added as an input for the query, the parents of T are
+  // derived from the alias V3:V2:V1:T
+  private final Map<String, ReadEntity> viewAliasToInput;
+
+  // need to merge the isDirect flag into the input even if the newInput does not have a parent
+  private boolean mergeIsDirect;
+
+  // flag for no scan during analyze ... compute statistics
+  protected boolean noscan;
+
+  //flag for partial scan during analyze ... compute statistics
+  protected boolean partialscan;
+
+  protected volatile boolean disableJoinMerge = false;
+  protected final boolean defaultJoinMerge;
+
+  /*
+   * Capture the CTE definitions in a Query.
+   */
+  final Map<String, CTEClause> aliasToCTEs;
+
+  /*
+   * Used to check recursive CTE invocations. Similar to viewsExpanded
+   */
+  ArrayList<String> ctesExpanded;
+
+  /*
+   * Whether root tasks after materialized CTE linkage have been resolved
+   */
+  boolean rootTasksResolved;
+
+  protected TableMask tableMask;
+
+  CreateTableDesc tableDesc;
+
+  /** Not thread-safe. */
+  final ASTSearcher astSearcher = new ASTSearcher();
+
+  protected AnalyzeRewriteContext analyzeRewrite;
+
+  // A mapping from a tableName to a table object in metastore.
+  Map<String, Table> tabNameToTabObject;
+
+  // The tokens we should ignore when we are trying to do table masking.
+  private final Set<Integer> ignoredTokens = Sets.newHashSet(HiveParser.TOK_GROUPBY,
+      HiveParser.TOK_ORDERBY, HiveParser.TOK_WINDOWSPEC, HiveParser.TOK_CLUSTERBY,
+      HiveParser.TOK_DISTRIBUTEBY, HiveParser.TOK_SORTBY);
+
+  static class Phase1Ctx {
+    String dest;
+    int nextNum;
+  }
+
+  public SemanticAnalyzer(QueryState queryState) throws SemanticException {
+    super(queryState);
+    opToPartPruner = new HashMap<TableScanOperator, ExprNodeDesc>();
+    opToPartList = new HashMap<TableScanOperator, PrunedPartitionList>();
+    opToSamplePruner = new HashMap<TableScanOperator, SampleDesc>();
+    nameToSplitSample = new HashMap<String, SplitSample>();
+    // Must be deterministic order maps - see HIVE-8707
+    topOps = new LinkedHashMap<String, TableScanOperator>();
+    loadTableWork = new ArrayList<LoadTableDesc>();
+    loadFileWork = new ArrayList<LoadFileDesc>();
+    columnStatsAutoGatherContexts = new ArrayList<ColumnStatsAutoGatherContext>();
+    opParseCtx = new LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext>();
+    joinContext = new HashMap<JoinOperator, QBJoinTree>();
+    smbMapJoinContext = new HashMap<SMBMapJoinOperator, QBJoinTree>();
+    // Must be deterministic order map for consistent q-test output across Java versions
+    topToTable = new LinkedHashMap<TableScanOperator, Table>();
+    fsopToTable = new HashMap<FileSinkOperator, Table>();
+    reduceSinkOperatorsAddedByEnforceBucketingSorting = new ArrayList<ReduceSinkOperator>();
+    topToTableProps = new HashMap<TableScanOperator, Map<String, String>>();
+    destTableId = 1;
+    uCtx = null;
+    listMapJoinOpsNoReducer = new ArrayList<AbstractMapJoinOperator<? extends MapJoinDesc>>();
+    groupOpToInputTables = new HashMap<GroupByOperator, Set<String>>();
+    prunedPartitions = new HashMap<String, PrunedPartitionList>();
+    tabNameToTabObject = new HashMap<String, Table>();
+    unparseTranslator = new UnparseTranslator(conf);
+    autogenColAliasPrfxLbl = HiveConf.getVar(conf,
+        HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_LABEL);
+    autogenColAliasPrfxIncludeFuncName = HiveConf.getBoolVar(conf,
+        HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_INCLUDEFUNCNAME);
+    queryProperties = new QueryProperties();
+    opToPartToSkewedPruner = new HashMap<TableScanOperator, Map<String, ExprNodeDesc>>();
+    aliasToCTEs = new HashMap<String, CTEClause>();
+    globalLimitCtx = new GlobalLimitCtx();
+    viewAliasToInput = new HashMap<String, ReadEntity>();
+    mergeIsDirect = true;
+    noscan = partialscan = false;
+    tabNameToTabObject = new HashMap<>();
+    defaultJoinMerge = false == HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_MERGE_NWAY_JOINS);
+    disableJoinMerge = defaultJoinMerge;
+  }
+
+  @Override
+  protected void reset(boolean clearPartsCache) {
+    super.reset(true);
+    if(clearPartsCache) {
+      prunedPartitions.clear();
+
+      // When init(true) is combined with genResolvedParseTree, it will generate the resolved parse tree from the syntax tree.
+      // ReadEntities created under these conditions should all be relevant to the syntax tree, even the ones without parents,
+      // so set mergeIsDirect to true here.
+      mergeIsDirect = true;
+    } else {
+      mergeIsDirect = false;
+    }
+    tabNameToTabObject.clear();
+    loadTableWork.clear();
+    loadFileWork.clear();
+    columnStatsAutoGatherContexts.clear();
+    topOps.clear();
+    destTableId = 1;
+    idToTableNameMap.clear();
+    qb = null;
+    ast = null;
+    uCtx = null;
+    joinContext.clear();
+    smbMapJoinContext.clear();
+    opParseCtx.clear();
+    groupOpToInputTables.clear();
+    disableJoinMerge = defaultJoinMerge;
+    aliasToCTEs.clear();
+    topToTable.clear();
+    opToPartPruner.clear();
+    opToPartList.clear();
+    opToPartToSkewedPruner.clear();
+    opToSamplePruner.clear();
+    nameToSplitSample.clear();
+    fsopToTable.clear();
+    resultSchema = null;
+    createVwDesc = null;
+    viewsExpanded = null;
+    viewSelect = null;
+    ctesExpanded = null;
+    globalLimitCtx.disableOpt();
+    viewAliasToInput.clear();
+    reduceSinkOperatorsAddedByEnforceBucketingSorting.clear();
+    topToTableProps.clear();
+    listMapJoinOpsNoReducer.clear();
+    unparseTranslator.clear();
+    queryProperties.clear();
+    outputs.clear();
+  }
+
+  public void initParseCtx(ParseContext pctx) {
+    opToPartPruner = pctx.getOpToPartPruner();
+    opToPartList = pctx.getOpToPartList();
+    opToSamplePruner = pctx.getOpToSamplePruner();
+    topOps = pctx.getTopOps();
+    loadTableWork = pctx.getLoadTableWork();
+    loadFileWork = pctx.getLoadFileWork();
+    ctx = pctx.getContext();
+    destTableId = pctx.getDestTableId();
+    idToTableNameMap = pctx.getIdToTableNameMap();
+    uCtx = pctx.getUCtx();
+    listMapJoinOpsNoReducer = pctx.getListMapJoinOpsNoReducer();
+    prunedPartitions = pctx.getPrunedPartitions();
+    tabNameToTabObject = pctx.getTabNameToTabObject();
+    fetchTask = pctx.getFetchTask();
+    setLineageInfo(pctx.getLineageInfo());
+  }
+
+  public ParseContext getParseContext() {
+    // Make sure the basic query properties are initialized
+    copyInfoToQueryProperties(queryProperties);
+    return new ParseContext(queryState, opToPartPruner, opToPartList, topOps,
+        new HashSet<JoinOperator>(joinContext.keySet()),
+        new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()),
+        loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx,
+        listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject,
+        opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks,
+        opToPartToSkewedPruner, viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting,
+        analyzeRewrite, tableDesc, createVwDesc, queryProperties, viewProjectToTableSchema, acidFileSinks);
+  }
+
+  public CompilationOpContext getOpContext() {
+    return ctx.getOpContext();
+  }
+
+  public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias)
+      throws SemanticException {
+    doPhase1QBExpr(ast, qbexpr, id, alias, false);
+  }
+  @SuppressWarnings("nls")
+  public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias, boolean insideView)
+      throws SemanticException {
+
+    assert (ast.getToken() != null);
+    if (ast.getToken().getType() == HiveParser.TOK_QUERY) {
+      QB qb = new QB(id, alias, true);
+      qb.setInsideView(insideView);
+      Phase1Ctx ctx_1 = initPhase1Ctx();
+      doPhase1(ast, qb, ctx_1, null);
+
+      qbexpr.setOpcode(QBExpr.Opcode.NULLOP);
+      qbexpr.setQB(qb);
+    }
+    // setop
+    else {
+      switch (ast.getToken().getType()) {
+      case HiveParser.TOK_UNIONALL:
+        qbexpr.setOpcode(QBExpr.Opcode.UNION);
+        break;
+      case HiveParser.TOK_INTERSECTALL:
+        qbexpr.setOpcode(QBExpr.Opcode.INTERSECTALL);
+        break;
+      case HiveParser.TOK_INTERSECTDISTINCT:
+        qbexpr.setOpcode(QBExpr.Opcode.INTERSECT);
+        break;
+      case HiveParser.TOK_EXCEPTALL:
+        qbexpr.setOpcode(QBExpr.Opcode.EXCEPTALL);
+        break;
+      case HiveParser.TOK_EXCEPTDISTINCT:
+        qbexpr.setOpcode(QBExpr.Opcode.EXCEPT);
+        break;
+      default:
+        throw new SemanticException(ErrorMsg.UNSUPPORTED_SET_OPERATOR.getMsg("Type "
+            + ast.getToken().getType()));
+      }
+      // query 1
+      assert (ast.getChild(0) != null);
+      QBExpr qbexpr1 = new QBExpr(alias + SUBQUERY_TAG_1);
+      doPhase1QBExpr((ASTNode) ast.getChild(0), qbexpr1, id + SUBQUERY_TAG_1, alias
+          + SUBQUERY_TAG_1, insideView);
+      qbexpr.setQBExpr1(qbexpr1);
+
+      // query 2
+      assert (ast.getChild(1) != null);
+      QBExpr qbexpr2 = new QBExpr(alias + SUBQUERY_TAG_2);
+      doPhase1QBExpr((ASTNode) ast.getChild(1), qbexpr2, id + SUBQUERY_TAG_2, alias
+          + SUBQUERY_TAG_2, insideView);
+      qbexpr.setQBExpr2(qbexpr2);
+    }
+  }
+
+  private LinkedHashMap<String, ASTNode> doPhase1GetAggregationsFromSelect(
+      ASTNode selExpr, QB qb, String dest) throws SemanticException {
+
+    // Iterate over the select expressions, searching for aggregation trees.
+    // Use String as keys to eliminate duplicate trees.
+    LinkedHashMap<String, ASTNode> aggregationTrees = new LinkedHashMap<String, ASTNode>();
+    List<ASTNode> wdwFns = new ArrayList<ASTNode>();
+    for (int i = 0; i < selExpr.getChildCount(); ++i) {
+      ASTNode function = (ASTNode) selExpr.getChild(i);
+      if (function.getType() == HiveParser.TOK_SELEXPR ||
+          function.getType() == HiveParser.TOK_SUBQUERY_EXPR) {
+        function = (ASTNode)function.getChild(0);
+      }
+      doPhase1GetAllAggregations(function, aggregationTrees, wdwFns);
+    }
+
+    // Window-based aggregations are handled differently
+    for (ASTNode wdwFn : wdwFns) {
+      WindowingSpec spec = qb.getWindowingSpec(dest);
+      if(spec == null) {
+        queryProperties.setHasWindowing(true);
+        spec = new WindowingSpec();
+        qb.addDestToWindowingSpec(dest, spec);
+      }
+      HashMap<String, ASTNode> wExprsInDest = qb.getParseInfo().getWindowingExprsForClause(dest);
+      int wColIdx = spec.getWindowExpressions() == null ? 0 : spec.getWindowExpressions().size();
+      WindowFunctionSpec wFnSpec = processWindowFunction(wdwFn,
+        (ASTNode)wdwFn.getChild(wdwFn.getChildCount()-1));
+      // If this is a duplicate invocation of a function, don't add it to the WindowingSpec.
+      if ( wExprsInDest != null &&
+          wExprsInDest.containsKey(wFnSpec.getExpression().toStringTree())) {
+        continue;
+      }
+      wFnSpec.setAlias(wFnSpec.getName() + "_window_" + wColIdx);
+      spec.addWindowFunction(wFnSpec);
+      qb.getParseInfo().addWindowingExprToClause(dest, wFnSpec.getExpression());
+    }
+
+    return aggregationTrees;
+  }
+
+  private void doPhase1GetColumnAliasesFromSelect(
+      ASTNode selectExpr, QBParseInfo qbp) {
+    for (int i = 0; i < selectExpr.getChildCount(); ++i) {
+      ASTNode selExpr = (ASTNode) selectExpr.getChild(i);
+      if ((selExpr.getToken().getType() == HiveParser.TOK_SELEXPR)
+          && (selExpr.getChildCount() == 2)) {
+        String columnAlias = unescapeIdentifier(selExpr.getChild(1).getText());
+        qbp.setExprToColumnAlias((ASTNode) selExpr.getChild(0), columnAlias);
+      }
+    }
+  }
+
+  /**
+   * DFS-scan the expressionTree to find all aggregation subtrees and put them
+   * in aggregations.
+   *
+   * @param expressionTree
+   * @param aggregations
+   *          the key to the map is the toStringTree() representation of
+   *          the aggregation subtree.
+   * @param wdwFns
+   *          collects windowing function invocations found in the tree.
+   * @throws SemanticException
+   */
+  private void doPhase1GetAllAggregations(ASTNode expressionTree,
+      HashMap<String, ASTNode> aggregations, List<ASTNode> wdwFns) throws SemanticException {
+    int exprTokenType = expressionTree.getToken().getType();
+    if(exprTokenType == HiveParser.TOK_SUBQUERY_EXPR) {
+      // Since we now have scalar subqueries, we can get a subquery expression in HAVING;
+      // we don't want to include aggregates from within the subquery
+      return;
+    }
+
+    if (exprTokenType == HiveParser.TOK_FUNCTION
+        || exprTokenType == HiveParser.TOK_FUNCTIONDI
+        || exprTokenType == HiveParser.TOK_FUNCTIONSTAR) {
+      assert (expressionTree.getChildCount() != 0);
+      if (expressionTree.getChild(expressionTree.getChildCount()-1).getType()
+          == HiveParser.TOK_WINDOWSPEC) {
+        // If it is a windowing spec, we include it in the list
+        // Further, we will examine its children AST nodes to check whether
+        // there are aggregation functions within
+        wdwFns.add(expressionTree);
+        doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(expressionTree.getChildCount()-1),
+                aggregations, wdwFns);
+        return;
+      }
+      if (expressionTree.getChild(0).getType() == HiveParser.Identifier) {
+        String functionName = unescapeIdentifier(expressionTree.getChild(0)
+            .getText());
+        // Validate the function name
+        if (FunctionRegistry.getFunctionInfo(functionName) == null) {
+          throw new SemanticException(ErrorMsg.INVALID_FUNCTION.getMsg(functionName));
+        }
+        if(FunctionRegistry.impliesOrder(functionName)) {
+          throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName));
+        }
+        if (FunctionRegistry.getGenericUDAFResolver(functionName) != null) {
+          if(containsLeadLagUDF(expressionTree)) {
+            throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName));
+          }
+          aggregations.put(expressionTree.toStringTree(), expressionTree);
+          FunctionInfo fi = FunctionRegistry.getFunctionInfo(functionName);
+          if (!fi.isNative()) {
+            unparseTranslator.addIdentifierTranslation((ASTNode) expressionTree
+                .getChild(0));
+          }
+          return;
+        }
+      }
+    }
+    for (int i = 0; i < expressionTree.getChildCount(); i++) {
+      doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(i),
+          aggregations, wdwFns);
+    }
+  }
+
+  private List<ASTNode> doPhase1GetDistinctFuncExprs(
+      HashMap<String, ASTNode> aggregationTrees) throws SemanticException {
+    List<ASTNode> exprs = new ArrayList<ASTNode>();
+    for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
+      ASTNode value = entry.getValue();
+      assert (value != null);
+      if (value.getToken().getType() == HiveParser.TOK_FUNCTIONDI) {
+        exprs.add(value);
+      }
+    }
+    return exprs;
+  }
+
+  public static String generateErrorMessage(ASTNode ast, String message) {
+    StringBuilder sb = new StringBuilder();
+    if (ast == null) {
+      sb.append(message).append(". Cannot tell the position of null AST.");
+      return sb.toString();
+    }
+    sb.append(ast.getLine());
+    sb.append(":");
+    sb.append(ast.getCharPositionInLine());
+    sb.append(" ");
+    sb.append(message);
+    sb.append(". Error encountered near token '");
+    sb.append(ErrorMsg.getText(ast));
+    sb.append("'");
+    return sb.toString();
+  }
+
+  ASTNode getAST() {
+    return this.ast;
+  }
+
+  protected void setAST(ASTNode newAST) {
+    this.ast = newAST;
+  }
+
+  int[] findTabRefIdxs(ASTNode tabref) {
+    assert tabref.getType() == HiveParser.TOK_TABREF;
+    int aliasIndex = 0;
+    int propsIndex = -1;
+    int tsampleIndex = -1;
+    int ssampleIndex = -1;
+    for (int index = 1; index < tabref.getChildCount(); index++) {
+      ASTNode ct = (ASTNode) tabref.getChild(index);
+      if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
+        tsampleIndex = index;
+      } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+        ssampleIndex = index;
+      } else if (ct.getToken().getType() == HiveParser.TOK_TABLEPROPERTIES) {
+        propsIndex = index;
+      } else {
+        aliasIndex = index;
+      }
+    }
+    return new int[] {aliasIndex, propsIndex, tsampleIndex, ssampleIndex};
+  }
+  String findSimpleTableName(ASTNode tabref, int aliasIndex) {
+    assert tabref.getType() == HiveParser.TOK_TABREF;
+    ASTNode tableTree = (ASTNode) (tabref.getChild(0));
+
+    String alias;
+    if (aliasIndex != 0) {
+      alias = unescapeIdentifier(tabref.getChild(aliasIndex).getText());
+    }
+    else {
+      alias = getUnescapedUnqualifiedTableName(tableTree);
+    }
+    return alias;
+  }
+  /**
+   * Goes through the tabref tree and finds the alias for the table. Once found,
+   * it records the table name -> alias association in aliasToTabs. It also makes
+   * an association from the alias to the table AST in parse info.
+   *
+   * @return the alias of the table
+   */
+  private String processTable(QB qb, ASTNode tabref) throws SemanticException {
+    // For each table reference get the table name
+    // and the alias (if alias is not present, the table name
+    // is used as an alias)
+    int[] indexes = findTabRefIdxs(tabref);
+    int aliasIndex = indexes[0];
+    int propsIndex = indexes[1];
+    int tsampleIndex = indexes[2];
+    int ssampleIndex = indexes[3];
+
+    ASTNode tableTree = (ASTNode) (tabref.getChild(0));
+
+    String tabIdName = getUnescapedName(tableTree).toLowerCase();
+
+    String alias = findSimpleTableName(tabref, aliasIndex);
+
+    if (propsIndex >= 0) {
+      Tree propsAST = tabref.getChild(propsIndex);
+      Map<String, String> props = DDLSemanticAnalyzer.getProps((ASTNode) propsAST.getChild(0));
+      // We get the information from Calcite.
+      if ("TRUE".equals(props.get("insideView"))) {
+        qb.getAliasInsideView().add(alias.toLowerCase());
+      }
+      qb.setTabProps(alias, props);
+    }
+
+    // If the alias is already there then we have a conflict
+    if (qb.exists(alias)) {
+      throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(tabref
+          .getChild(aliasIndex)));
+    }
+    if (tsampleIndex >= 0) {
+      ASTNode sampleClause = (ASTNode) tabref.getChild(tsampleIndex);
+      ArrayList<ASTNode> sampleCols = new ArrayList<ASTNode>();
+      if (sampleClause.getChildCount() > 2) {
+        for (int i = 2; i < sampleClause.getChildCount(); i++) {
+          sampleCols.add((ASTNode) sampleClause.getChild(i));
+        }
+      }
+      // TODO: For now only support sampling on up to two columns
+      // Need to change it to list of columns
+      if (sampleCols.size() > 2) {
+        throw new SemanticException(generateErrorMessage(
+            (ASTNode) tabref.getChild(0),
+            ErrorMsg.SAMPLE_RESTRICTION.getMsg()));
+      }
+      TableSample tabSample = new TableSample(
+              unescapeIdentifier(sampleClause.getChild(0).getText()),
+              unescapeIdentifier(sampleClause.getChild(1).getText()),
+              sampleCols);
+      qb.getParseInfo().setTabSample(alias, tabSample);
+      if (unparseTranslator.isEnabled()) {
+        for (ASTNode sampleCol : sampleCols) {
+          unparseTranslator.addIdentifierTranslation((ASTNode) sampleCol
+              .getChild(0));
+        }
+      }
+    } else if (ssampleIndex >= 0) {
+      ASTNode sampleClause = (ASTNode) tabref.getChild(ssampleIndex);
+
+      Tree type = sampleClause.getChild(0);
+      Tree numerator = sampleClause.getChild(1);
+      String value = unescapeIdentifier(numerator.getText());
+
+
+      SplitSample sample;
+      if (type.getType() == HiveParser.TOK_PERCENT) {
+        assertCombineInputFormat(numerator, "Percentage");
+        Double percent = Double.valueOf(value).doubleValue();
+        if (percent < 0  || percent > 100) {
+          throw new SemanticException(generateErrorMessage((ASTNode) numerator,
+              "Sampling percentage should be between 0 and 100"));
+        }
+        int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM);
+        sample = new SplitSample(percent, seedNum);
+      } else if (type.getType() == HiveParser.TOK_ROWCOUNT) {
+        sample = new SplitSample(Integer.parseInt(value));
+      } else {
+        assert type.getType() == HiveParser.TOK_LENGTH;
+        assertCombineInputFormat(numerator, "Total Length");
+        long length = Integer.parseInt(value.substring(0, value.length() - 1));
+        char last = value.charAt(value.length() - 1);
+        if (last == 'k' || last == 'K') {
+          length <<= 10;
+        } else if (last == 'm' || last == 'M') {
+          length <<= 20;
+        } else if (last == 'g' || last == 'G') {
+          length <<= 30;
+        }
+        int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM);
+        sample = new SplitSample(length, seedNum);
+      }
+      String alias_id = getAliasId(alias, qb);
+      nameToSplitSample.put(alias_id, sample);
+    }
+    // Insert this map into the stats
+    qb.setTabAlias(alias, tabIdName);
+    if (qb.isInsideView()) {
+      qb.getAliasInsideView().add(alias.toLowerCase());
+    }
+    qb.addAlias(alias);
+
+    qb.getParseInfo().setSrcForAlias(alias, tableTree);
+
+    // if alias to CTE contains the table name, we do not do the translation because
+    // cte is actually a subquery.
+    if (!this.aliasToCTEs.containsKey(tabIdName)) {
+      unparseTranslator.addTableNameTranslation(tableTree, SessionState.get().getCurrentDatabase());
+      if (aliasIndex != 0) {
+        unparseTranslator.addIdentifierTranslation((ASTNode) tabref.getChild(aliasIndex));
+      }
+    }
+
+    return alias;
+  }
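
A standalone sketch (not part of the patch) of the suffix arithmetic used in the
TOK_LENGTH branch of processTable above: "100k" becomes 100 << 10 bytes, "100m"
becomes 100 << 20, and "100g" becomes 100 << 30.

public class SplitSampleLengthSketch {
  static long toBytes(String value) {
    long length = Integer.parseInt(value.substring(0, value.length() - 1));
    char last = value.charAt(value.length() - 1);
    if (last == 'k' || last == 'K') {
      length <<= 10;
    } else if (last == 'm' || last == 'M') {
      length <<= 20;
    } else if (last == 'g' || last == 'G') {
      length <<= 30;
    }
    return length;
  }
  public static void main(String[] args) {
    System.out.println(toBytes("100m"));   // 104857600
  }
}
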
+
+  Map<String, SplitSample> getNameToSplitSampleMap() {
+    return this.nameToSplitSample;
+  }
+
+  /**
+   * Convert a string to Text format and write its bytes in the same way TextOutputFormat would do.
+   * This is needed to properly encode non-ascii characters.
+   */
+  private static void writeAsText(String text, FSDataOutputStream out) throws IOException {
+    Text to = new Text(text);
+    out.write(to.getBytes(), 0, to.getLength());
+  }
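
A small standalone sketch (not part of the patch) of why Text is used in
writeAsText above: it yields the UTF-8 byte form of the string, so non-ASCII
characters keep their full multi-byte encoding when written to the data file.

import org.apache.hadoop.io.Text;

public class WriteAsTextSketch {
  public static void main(String[] args) {
    Text t = new Text("héllo");          // 'é' is two bytes in UTF-8
    System.out.println(t.getLength());   // 6 bytes, not 5
  }
}
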
+
+  /**
+   * Generate a temp table out of a values clause
+   * See also {@link #preProcessForInsert(ASTNode, QB)}
+   */
+  private ASTNode genValuesTempTable(ASTNode originalFrom, QB qb) throws SemanticException {
+    Path dataDir = null;
+    if(!qb.getEncryptedTargetTablePaths().isEmpty()) {
+      //currently only Insert into T values(...) is supported, thus only 1 values clause
+      //and only 1 target table are possible.  If/when support for
+      //select ... from values(...) is added, an insert statement may have multiple
+      //encrypted target tables.
+      dataDir = ctx.getMRTmpPath(qb.getEncryptedTargetTablePaths().get(0).toUri());
+    }
+    // Pick a name for the table
+    SessionState ss = SessionState.get();
+    String tableName = VALUES_TMP_TABLE_NAME_PREFIX + ss.getNextValuesTempTableSuffix();
+
+    // Step 1, parse the values clause we were handed
+    List<? extends Node> fromChildren = originalFrom.getChildren();
+    // First child should be the virtual table ref
+    ASTNode virtualTableRef = (ASTNode)fromChildren.get(0);
+    assert virtualTableRef.getToken().getType() == HiveParser.TOK_VIRTUAL_TABREF :
+        "Expected first child of TOK_VIRTUAL_TABLE to be TOK_VIRTUAL_TABREF but was " +
+            virtualTableRef.getName();
+
+    List<? extends Node> virtualTableRefChildren = virtualTableRef.getChildren();
+    // First child of this should be the table name.  If it's anonymous,
+    // then we don't have a table name.
+    ASTNode tabName = (ASTNode)virtualTableRefChildren.get(0);
+    if (tabName.getToken().getType() != HiveParser.TOK_ANONYMOUS) {
+      // TODO, if you want to make select ... from (values(...)) as foo(...) work,
+      // you need to parse this list of column names and build it into the table
+      throw new SemanticException(ErrorMsg.VALUES_TABLE_CONSTRUCTOR_NOT_SUPPORTED.getMsg());
+    }
+
+    // The second child of the TOK_VIRTUAL_TABLE should be TOK_VALUES_TABLE
+    ASTNode valuesTable = (ASTNode)fromChildren.get(1);
+    assert valuesTable.getToken().getType() == HiveParser.TOK_VALUES_TABLE :
+        "Expected second child of TOK_VIRTUAL_TABLE to be TOK_VALUE_TABLE but was " +
+            valuesTable.getName();
+    // Each of the children of TOK_VALUES_TABLE will be a TOK_VALUE_ROW
+    List<? extends Node> valuesTableChildren = valuesTable.getChildren();
+
+    // Now that we're going to start reading through the rows, open a file to write the rows to.
+    // If we leave this method before creating the temporary table we need to be sure to clean up
+    // this file.
+    Path tablePath = null;
+    FileSystem fs = null;
+    FSDataOutputStream out = null;
+    try {
+      if(dataDir == null) {
+        tablePath = Warehouse.getDnsPath(new Path(ss.getTempTableSpace(), tableName), conf);
+      }
+      else {
+        //if target table of insert is encrypted, make sure temporary table data is stored
+        //similarly encrypted
+        tablePath = Warehouse.getDnsPath(new Path(dataDir, tableName), conf);
+      }
+      fs = tablePath.getFileSystem(conf);
+      fs.mkdirs(tablePath);
+      Path dataFile = new Path(tablePath, "data_file");
+      out = fs.create(dataFile);
+      List<FieldSchema> fields = new ArrayList<FieldSchema>();
+
+      boolean firstRow = true;
+      for (Node n : valuesTableChildren) {
+        ASTNode valuesRow = (ASTNode) n;
+        assert valuesRow.getToken().getType() == HiveParser.TOK_VALUE_ROW :
+            "Expected child of TOK_VALUE_TABLE to be TOK_VALUE_ROW but was " + valuesRow.getName();
+        // Each of the children of this should be a literal
+        List<? extends Node> valuesRowChildren = valuesRow.getChildren();
+        boolean isFirst = true;
+        int nextColNum = 1;
+        for (Node n1 : valuesRowChildren) {
+          ASTNode value = (ASTNode) n1;
+          if (firstRow) {
+            fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", ""));
+          }
+          if (isFirst) isFirst = false;
+          else writeAsText("\u0001", out);
+          writeAsText(unparseExprForValuesClause(value), out);
+        }
+        writeAsText("\n", out);
+        firstRow = false;
+      }
+
+      // Step 2, create a temp table, using the created file as the data
+      StorageFormat format = new StorageFormat(conf);
+      format.processStorageFormat("TextFile");
+      Table table = db.newTable(tableName);
+      table.setSerializationLib(format.getSerde());
+      table.setFields(fields);
+      table.setDataLocation(tablePath);
+      table.getTTable().setTemporary(true);
+      table.setStoredAsSubDirectories(false);
+      table.setInputFormatClass(format.getInputFormat());
+      table.setOutputFormatClass(format.getOutputFormat());
+      db.createTable(table, false);
+    } catch (Exception e) {
+      String errMsg = ErrorMsg.INSERT_CANNOT_CREATE_TEMP_FILE.getMsg() + e.getMessage();
+      LOG.error(errMsg);
+      // Try to delete the file
+      if (fs != null && tablePath != null) {
+        try {
+          fs.delete(tablePath, false);
+        } catch (IOException swallowIt) {}
+      }
+      throw new SemanticException(errMsg, e);
+    } finally {
+        IOUtils.closeStream(out);
+    }
+
+    // Step 3, return a new subtree with a from clause built around that temp table
+    // The form of the tree is TOK_TABREF->TOK_TABNAME->identifier(tablename)
+    Token t = new ClassicToken(HiveParser.TOK_TABREF);
+    ASTNode tabRef = new ASTNode(t);
+    t = new ClassicToken(HiveParser.TOK_TABNAME);
+    ASTNode tabNameNode = new ASTNode(t);
+    tabRef.addChild(tabNameNode);
+    t = new ClassicToken(HiveParser.Identifier, tableName);
+    ASTNode identifier = new ASTNode(t);
+    tabNameNode.addChild(identifier);
+    return tabRef;
+  }
+
+  // Take an expression in the values clause and turn it back into a string.  This is far from
+  // comprehensive.  At the moment it only supports:
+  // * literals (all types)
+  // * unary negatives
+  // * true/false
+  private String unparseExprForValuesClause(ASTNode expr) throws SemanticException {
+    switch (expr.getToken().getType()) {
+      case HiveParser.Number:
+        return expr.getText();
+
+      case HiveParser.StringLiteral:
+        return BaseSemanticAnalyzer.unescapeSQLString(expr.getText());
+
+      case HiveParser.KW_FALSE:
+        // UDFToBoolean casts any non-empty string to true, so set this to false
+        return "";
+
+      case HiveParser.KW_TRUE:
+        return "TRUE";
+
+      case HiveParser.MINUS:
+        return "-" + unparseExprForValuesClause((ASTNode)expr.getChildren().get(0));
+
+      case HiveParser.TOK_NULL:
+        // Hive's text input will translate this as a null
+        return "\\N";
+
+      default:
+        throw new SemanticException("Expression of type " + expr.getText() +
+            " not supported in insert/values");
+    }
+
+  }
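
A standalone sketch (not part of the patch) of the row format that
genValuesTempTable writes via unparseExprForValuesClause: fields are joined with
the default ^A (\u0001) separator, rows end with a newline, boolean FALSE is
rendered as the empty string, and NULL as \N for Hive's text input to re-read.

public class ValuesRowEncodingSketch {
  public static void main(String[] args) {
    // e.g. INSERT INTO t VALUES (1, 'ab', false, null)
    String[] unparsed = {"1", "ab", "", "\\N"};
    String row = String.join("\u0001", unparsed) + "\n";
    System.out.println(row.length());   // 9 characters: 1^Aab^A^A\N\n
  }
}
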
+
+  private void assertCombineInputFormat(Tree numerator, String message) throws SemanticException {
+    String inputFormat = conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") ?
+      HiveConf.getVar(conf, HiveConf.ConfVars.HIVETEZINPUTFORMAT):
+      HiveConf.getVar(conf, HiveConf.ConfVars.HIVEINPUTFORMAT);
+    if (!inputFormat.equals(CombineHiveInputFormat.class.getName())) {
+      throw new SemanticException(generateErrorMessage((ASTNode) numerator,
+          message + " sampling is not supported in " + inputFormat));
+    }
+  }
+
+  private String processSubQuery(QB qb, ASTNode subq) throws SemanticException {
+
+    // This is a subquery and must have an alias
+    if (subq.getChildCount() != 2) {
+      throw new SemanticException(ErrorMsg.NO_SUBQUERY_ALIAS.getMsg(subq));
+    }
+    ASTNode subqref = (ASTNode) subq.getChild(0);
+    String alias = unescapeIdentifier(subq.getChild(1).getText());
+
+    // Recursively do the first phase of semantic analysis for the subquery
+    QBExpr qbexpr = new QBExpr(alias);
+
+    doPhase1QBExpr(subqref, qbexpr, qb.getId(), alias, qb.isInsideView());
+
+    // If the alias is already there then we have a conflict
+    if (qb.exists(alias)) {
+      throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(subq
+          .getChild(1)));
+    }
+    // Insert this map into the stats
+    qb.setSubqAlias(alias, qbexpr);
+    qb.addAlias(alias);
+
+    unparseTranslator.addIdentifierTranslation((ASTNode) subq.getChild(1));
+
+    return alias;
+  }
+
+  /*
+   * Phase1: hold onto any CTE definitions in aliasToCTEs.
+   * CTE definitions are global to the Query.
+   */
+  private void processCTE(QB qb, ASTNode ctes) throws SemanticException {
+
+    int numCTEs = ctes.getChildCount();
+
+    for(int i=0; i <numCTEs; i++) {
+      ASTNode cte = (ASTNode) ctes.getChild(i);
+      ASTNode cteQry = (ASTNode) cte.getChild(0);
+      String alias = unescapeIdentifier(cte.getChild(1).getText());
+
+      String qName = qb.getId() == null ? "" : qb.getId() + ":";
+      qName += alias.toLowerCase();
+
+      if ( aliasToCTEs.containsKey(qName)) {
+        throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(cte.getChild(1)));
+      }
+      aliasToCTEs.put(qName, new CTEClause(qName, cteQry));
+    }
+  }
+
+  /*
+   * We allow CTE definitions in views. So we can end up with a hierarchy of CTE definitions:
+   * - at the top level of a query statement
+   * - where a view is referenced.
+   * - views may refer to other views.
+   *
+   * The scoping rule we use is to search for a CTE from the current QB outwards. In order to
+   * disambiguate between CTEs at different levels, we qualify (prefix) them with the id of the QB
+   * they appear in when adding them to the <code>aliasToCTEs</code> map.
+   *
+   */
+  private CTEClause findCTEFromName(QB qb, String cteName) {
+    StringBuilder qId = new StringBuilder();
+    if (qb.getId() != null) {
+      qId.append(qb.getId());
+    }
+
+    while (qId.length() > 0) {
+      String nm = qId + ":" + cteName;
+      CTEClause cte = aliasToCTEs.get(nm);
+      if (cte != null) {
+        return cte;
+      }
+      int lastIndex = qId.lastIndexOf(":");
+      lastIndex = lastIndex < 0 ? 0 : lastIndex;
+      qId.setLength(lastIndex);
+    }
+    return aliasToCTEs.get(cteName);
+  }
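
A standalone sketch (not part of the patch) of the lookup order findCTEFromName
walks, reproducing its loop against plain strings instead of the aliasToCTEs map:
for a query block with id "q1:q2" and CTE name "src" it probes the most deeply
qualified name first and the unqualified name last.

import java.util.ArrayList;
import java.util.List;

public class CteScopeLookupSketch {
  static List<String> probeOrder(String qbId, String cteName) {
    StringBuilder qId = new StringBuilder(qbId == null ? "" : qbId);
    List<String> probes = new ArrayList<>();
    while (qId.length() > 0) {
      probes.add(qId + ":" + cteName);
      int lastIndex = qId.lastIndexOf(":");
      qId.setLength(lastIndex < 0 ? 0 : lastIndex);
    }
    probes.add(cteName);
    return probes;
  }
  public static void main(String[] args) {
    System.out.println(probeOrder("q1:q2", "src")); // [q1:q2:src, q1:src, src]
  }
}
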
+
+  /*
+   * If a CTE is referenced in a QueryBlock:
+   * - add it as a SubQuery for now.
+   *   - SQ.alias is the alias used in the QB. (If no alias is specified,
+   *     it uses the CTE name; this works just like table references.)
+   *   - Adding SQ done by:
+   *     - copying AST of CTE
+   *     - setting ASTOrigin on cloned AST.
+   *   - trigger phase 1 on new QBExpr.
+   *   - update QB data structs: remove this as a table reference, move it to a SQ invocation.
+   */
+  private void addCTEAsSubQuery(QB qb, String cteName, String cteAlias)
+      throws SemanticException {
+    cteAlias = cteAlias == null ? cteName : cteAlias;
+    CTEClause cte = findCTEFromName(qb, cteName);
+    ASTNode cteQryNode = cte.cteNode;
+    QBExpr cteQBExpr = new QBExpr(cteAlias);
+    doPhase1QBExpr(cteQryNode, cteQBExpr, qb.getId(), cteAlias);
+    qb.rewriteCTEToSubq(cteAlias, cteName, cteQBExpr);
+  }
+
+  private final CTEClause rootClause = new CTEClause(null, null);
+
+  @Override
+  public List<Task<? extends Serializable>> getAllRootTasks() {
+    if (!rootTasksResolved) {
+      rootTasks = toRealRootTasks(rootClause.asExecutionOrder());
+      rootTasksResolved = true;
+    }
+    return rootTasks;
+  }
+
+  @Override
+  public HashSet<ReadEntity> getAllInputs() {
+    HashSet<ReadEntity> readEntities = new HashSet<ReadEntity>(getInputs());
+    for (CTEClause cte : rootClause.asExecutionOrder()) {
+      if (cte.source != null) {
+        readEntities.addAll(cte.source.getInputs());
+      }
+    }
+    return readEntities;
+  }
+
+  @Override
+  public HashSet<WriteEntity> getAllOutputs() {
+    HashSet<WriteEntity> writeEntities = new HashSet<WriteEntity>(getOutputs());
+    for (CTEClause cte : rootClause.asExecutionOrder()) {
+      if (cte.source != null) {
+        writeEntities.addAll(cte.source.getOutputs());
+      }
+    }
+    return writeEntities;
+  }
+
+  class CTEClause {
+    CTEClause(String alias, ASTNode cteNode) {
+      this.alias = alias;
+      this.cteNode = cteNode;
+    }
+    String alias;
+    ASTNode cteNode;
+    boolean materialize;
+    int reference;
+    QBExpr qbExpr;
+    List<CTEClause> parents = new ArrayList<CTEClause>();
+
+    // materialized
+    Table table;
+    SemanticAnalyzer source;
+
+    List<Task<? extends Serializable>> getTasks() {
+      return source == null ? null : source.rootTasks;
+    }
+
+    List<CTEClause> asExecutionOrder() {
+      List<CTEClause> execution = new ArrayList<CTEClause>();
+      asExecutionOrder(new HashSet<CTEClause>(), execution);
+      return execution;
+    }
+
+    void asExecutionOrder(Set<CTEClause> visited, List<CTEClause> execution) {
+      for (CTEClause parent : parents) {
+        if (visited.add(parent)) {
+          parent.asExecutionOrder(visited, execution);
+        }
+      }
+      execution.add(this);
+    }
+
+    @Override
+    public String toString() {
+      return alias == null ? "<root>" : alias;
+    }
+  }
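
A standalone sketch (not part of the patch) of the ordering asExecutionOrder
produces: a clause's parents (the CTEs it references) are always emitted before
the clause itself, so with root referencing cte2 and cte2 referencing cte1 the
order is [cte1, cte2, root]. The class name and fields here are invented for the
illustration.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CteOrderSketch {
  String name;
  List<CteOrderSketch> parents = new ArrayList<>();
  CteOrderSketch(String name) { this.name = name; }

  void order(Set<CteOrderSketch> visited, List<String> out) {
    for (CteOrderSketch p : parents) {
      if (visited.add(p)) {
        p.order(visited, out);   // dependencies first
      }
    }
    out.add(name);
  }

  public static void main(String[] args) {
    CteOrderSketch cte1 = new CteOrderSketch("cte1");
    CteOrderSketch cte2 = new CteOrderSketch("cte2");
    CteOrderSketch root = new CteOrderSketch("root");
    cte2.parents.add(cte1);
    root.parents.add(cte2);
    List<String> out = new ArrayList<>();
    root.order(new HashSet<>(), out);
    System.out.println(out);   // [cte1, cte2, root]
  }
}
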
+
+  private List<Task<? extends Serializable>> toRealRootTasks(List<CTEClause> execution) {
+    List<Task<? extends Serializable>> cteRoots = new ArrayList<>();
+    List<Task<? extends Serializable>> cteLeafs = new ArrayList<>();
+    List<Task<? extends Serializable>> curTopRoots = null;
+    List<Task<? extends Serializable>> curBottomLeafs = null;
+    for (int i = 0; i < execution.size(); i++) {
+      CTEClause current = execution.get(i);
+      if (current.parents.isEmpty() && curTopRoots != null) {
+        cteRoots.addAll(curTopRoots);
+        cteLeafs.addAll(curBottomLeafs);
+        curTopRoots = curBottomLeafs = null;
+      }
+      List<Task<? extends Serializable>> curTasks = current.getTasks();
+      if (curTasks == null) {
+        continue;
+      }
+      if (curTopRoots == null) {
+        curTopRoots = curTasks;
+      }
+      if (curBottomLeafs != null) {
+        for (Task<?> topLeafTask : curBottomLeafs) {
+          for (Task<?> currentRootTask : curTasks) {
+            topLeafTask.addDependentTask(currentRootTask);
+          }
+        }
+      }
+      curBottomLeafs = Task.findLeafs(curTasks);
+    }
+    if (curTopRoots != null) {
+      cteRoots.addAll(curTopRoots);
+      cteLeafs.addAll(curBottomLeafs);
+    }
+
+    if (cteRoots.isEmpty()) {
+      return rootTasks;
+    }
+    for (Task<?> cteLeafTask : cteLeafs) {
+      for (Task<?> mainRootTask : rootTasks) {
+        cteLeafTask.addDependentTask(mainRootTask);
+      }
+    }
+    return cteRoots;
+  }
+
+  Table materializeCTE(String cteName, CTEClause cte) throws HiveException {
+
+    ASTNode createTable = new ASTNode(new ClassicToken(HiveParser.TOK_CREATETABLE));
+
+    ASTNode tableName = new ASTNode(new ClassicToken(HiveParser.TOK_TABNAME));
+    tableName.addChild(new ASTNode(new ClassicToken(HiveParser.Identifier, cteName)));
+
+    ASTNode temporary = new ASTNode(new ClassicToken(HiveParser.KW_TEMPORARY, MATERIALIZATION_MARKER));
+
+    createTable.addChild(tableName);
+    createTable.addChild(temporary);
+    createTable.addChild(cte.cteNode);
+
+    SemanticAnalyzer analyzer = new SemanticAnalyzer(queryState);
+    analyzer.initCtx(ctx);
+    analyzer.init(false);
+
+    // should share cte contexts
+    analyzer.aliasToCTEs.putAll(aliasToCTEs);
+
+    HiveOperation operation = queryState.getHiveOperation();
+    try {
+      analyzer.analyzeInternal(createTable);
+    } finally {
+      queryState.setCommandType(operation);
+    }
+
+    Table table = analyzer.tableDesc.toTable(conf);
+    Path location = table.getDataLocation();
+    try {
+      location.getFileSystem(conf).mkdirs(location);
+    } catch (IOException e) {
+      throw new HiveException(e);
+    }
+    table.setMaterializedTable(true);
+
+    LOG.info(cteName + " will be materialized into " + location);
+    cte.table = table;
+    cte.source = analyzer;
+
+    ctx.addMaterializedTable(cteName, table);
+
+    return table;
+  }
+
+
+  static boolean isJoinToken(ASTNode node) {
+    if ((node.getToken().getType() == HiveParser.TOK_JOIN)
+        || (node.getToken().getType() == HiveParser.TOK_CROSSJOIN)
+        || isOuterJoinToken(node)
+        || (node.getToken().getType() == HiveParser.TOK_LEFTSEMIJOIN)
+        || (node.getToken().getType() == HiveParser.TOK_UNIQUEJOIN)) {
+      return true;
+    }
+
+    return false;
+  }
+
+  static private boolean isOuterJoinToken(ASTNode node) {
+    return (node.getToken().getType() == HiveParser.TOK_LEFTOUTERJOIN)
+      || (node.getToken().getType() == HiveParser.TOK_RIGHTOUTERJOIN)
+      || (node.getToken().getType() == HiveParser.TOK_FULLOUTERJOIN);
+  }
+
+  /**
+   * Given the AST with TOK_JOIN as the root, get all the aliases for the tables
+   * or subqueries in the join.
+   *
+   * @param qb
+   * @param join
+   * @throws SemanticException
+   */
+  @SuppressWarnings("nls")
+  private void processJoin(QB qb, ASTNode join) throws SemanticException {
+    int numChildren = join.getChildCount();
+    if ((numChildren != 2) && (numChildren != 3)
+        && join.getToken().getType() != HiveParser.TOK_UNIQUEJOIN) {
+      throw new SemanticException(generateErrorMessage(join,
+          "Join with multiple children"));
+    }
+
+    queryProperties.incrementJoinCount(isOuterJoinToken(join));
+    for (int num = 0; num < numChildren; num++) {
+      ASTNode child = (ASTNode) join.getChild(num);
+      if (child.getToken().getType() == HiveParser.TOK_TABREF) {
+        processTable(qb, child);
+      } else if (child.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+        processSubQuery(qb, child);
+      } else if (child.getToken().getType() == HiveParser.TOK_PTBLFUNCTION) {
+        queryProperties.setHasPTF(true);
+        processPTF(qb, child);
+        PTFInvocationSpec ptfInvocationSpec = qb.getPTFInvocationSpec(child);
+        String inputAlias = ptfInvocationSpec == null ? null :
+          ptfInvocationSpec.getFunction().getAlias();
+        if ( inputAlias == null ) {
+          throw new SemanticException(generateErrorMessage(child,
+              "PTF invocation in a Join must have an alias"));
+        }
+
+      } else if (child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW ||
+          child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) {
+        // SELECT * FROM src1 LATERAL VIEW udtf() AS myTable JOIN src2 ...
+        // is not supported. Instead, the lateral view must be in a subquery
+        // SELECT * FROM (SELECT * FROM src1 LATERAL VIEW udtf() AS myTable) a
+        // JOIN src2 ...
+        throw new SemanticException(ErrorMsg.LATERAL_VIEW_WITH_JOIN
+            .getMsg(join));
+      } else if (isJoinToken(child)) {
+        processJoin(qb, child);
+      }
+    }
+  }
+
+  /**
+   * Given the AST with TOK_LATERAL_VIEW as the root, get the alias for the
+   * table or subquery in the lateral view and also make a mapping from the
+   * alias to all the lateral view AST's.
+   *
+   * @param qb
+   * @param lateralView
+   * @return the alias for the table/subquery
+   * @throws SemanticException
+   */
+
+  private String processLateralView(QB qb, ASTNode lateralView)
+      throws SemanticException {
+    int numChildren = lateralView.getChildCount();
+
+    assert (numChildren == 2);
+    ASTNode next = (ASTNode) lateralView.getChild(1);
+
+    String alias = null;
+
+    switch (next.getToken().getType()) {
+    case HiveParser.TOK_TABREF:
+      alias = processTable(qb, next);
+      break;
+    case HiveParser.TOK_SUBQUERY:
+      alias = processSubQuery(qb, next);
+      break;
+    case HiveParser.TOK_LATERAL_VIEW:
+    case HiveParser.TOK_LATERAL_VIEW_OUTER:
+      alias = processLateralView(qb, next);
+      break;
+    default:
+      throw new SemanticException(ErrorMsg.LATERAL_VIEW_INVALID_CHILD
+          .getMsg(lateralView));
+    }
+    alias = alias.toLowerCase();
+    qb.getParseInfo().addLateralViewForAlias(alias, lateralView);
+    qb.addAlias(alias);
+    return alias;
+  }
+
+  /**
+   * Phase 1: (including, but not limited to):
+   *
+   * 1. Gets all the aliases for all the tables / subqueries and makes the
+   *    appropriate mapping in aliasToTabs, aliasToSubq
+   * 2. Gets the location of the destination and names the clause "inclause" + i
+   * 3. Creates a map from a string representation of an aggregation tree to
+   *    the actual aggregation AST
+   * 4. Creates a mapping from the clause name to the select expression AST in
+   *    destToSelExpr
+   * 5. Creates a mapping from a table alias to the lateral view ASTs in
+   *    aliasToLateralViews
+   *
+   * @param ast
+   * @param qb
+   * @param ctx_1
+   * @throws SemanticException
+   */
+  @SuppressWarnings({"fallthrough", "nls"})
+  public boolean doPhase1(ASTNode ast, QB qb, Phase1Ctx ctx_1, PlannerContext plannerCtx)
+      throws SemanticException {
+
+    boolean phase1Result = true;
+    QBParseInfo qbp = qb.getParseInfo();
+    boolean skipRecursion = false;
+
+    if (ast.getToken() != null) {
+      skipRecursion = true;
+      switch (ast.getToken().getType()) {
+      case HiveParser.TOK_SELECTDI:
+        qb.countSelDi();
+        // fall through
+      case HiveParser.TOK_SELECT:
+        qb.countSel();
+        qbp.setSelExprForClause(ctx_1.dest, ast);
+
+        int posn = 0;
+        if (((ASTNode) ast.getChild(0)).getToken().getType() == HiveParser.QUERY_HINT) {
+          ParseDriver pd = new ParseDriver();
+          String queryHintStr = ast.getChild(0).getText();
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("QUERY HINT: "+queryHintStr);
+          }
+          try {
+            ASTNode hintNode = pd.parseHint(queryHintStr);
+            qbp.setHints((ASTNode) hintNode);
+            posn++;
+          } catch (ParseException e) {
+            throw new SemanticException("failed to parse query hint: "+e.getMessage(), e);
+          }
+        }
+
+        if ((ast.getChild(posn).getChild(0).getType() == HiveParser.TOK_TRANSFORM))
+          queryProperties.setUsesScript(true);
+
+        LinkedHashMap<String, ASTNode> aggregations = doPhase1GetAggregationsFromSelect(ast,
+            qb, ctx_1.dest);
+        doPhase1GetColumnAliasesFromSelect(ast, qbp);
+        qbp.setAggregationExprsForClause(ctx_1.dest, aggregations);
+        qbp.setDistinctFuncExprsForClause(ctx_1.dest,
+          doPhase1GetDistinctFuncExprs(aggregations));
+        break;
+
+      case HiveParser.TOK_WHERE:
+        qbp.setWhrExprForClause(ctx_1.dest, ast);
+        if (!SubQueryUtils.findSubQueries((ASTNode) ast.getChild(0)).isEmpty())
+            queryProperties.setFilterWithSubQuery(true);
+        break;
+
+      case HiveParser.TOK_INSERT_INTO:
+        String currentDatabase = SessionState.get().getCurrentDatabase();
+        String tab_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0), currentDatabase);
+        qbp.addInsertIntoTable(tab_name, ast);
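+        // falls through: an INSERT INTO also needs the destination bookkeeping below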
+
+      case HiveParser.TOK_DESTINATION:
+        ctx_1.dest = this.ctx.getDestNamePrefix(ast).toString() + ctx_1.nextNum;
+        ctx_1.nextNum++;
+        boolean isTmpFileDest = false;
+        if (ast.getChildCount() > 0 && ast.getChild(0) instanceof ASTNode) {
+          ASTNode ch = (ASTNode) ast.getChild(0);
+          if (ch.getToken().getType() == HiveParser.TOK_DIR && ch.getChildCount() > 0
+              && ch.getChild(0) instanceof ASTNode) {
+            ch = (ASTNode) ch.getChild(0);
+            isTmpFileDest = ch.getToken().getType() == HiveParser.TOK_TMP_FILE;
+          } else {
+            if (ast.getToken().getType() == HiveParser.TOK_DESTINATION
+                && ast.getChild(0).getType() == HiveParser.TOK_TAB) {
+              String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0),
+                  SessionState.get().getCurrentDatabase());
+              qbp.getInsertOverwriteTables().put(fullTableName, ast);
+            }
+          }
+        }
+
+        // is there an insert in the subquery
+        if (qbp.getIsSubQ() && !isTmpFileDest) {
+          throw new SemanticException(ErrorMsg.NO_INSERT_INSUBQUERY.getMsg(ast));
+        }
+
+        qbp.setDestForClause(ctx_1.dest, (ASTNode) ast.getChild(0));
+        handleInsertStatementSpecPhase1(ast, qbp, ctx_1);
+
+        if (qbp.getClauseNamesForDest().size() == 2) {
+          // From the moment that we have two destination clauses,
+          // we know that this is a multi-insert query.
+          // Thus, set property to right value.
+          // Using qbp.getClauseNamesForDest().size() >= 2 would be
+          // equivalent, but we use == to avoid setting the property
+          // multiple times
+          queryProperties.setMultiDestQuery(true);
+        }
+
+        if (plannerCtx != null && !queryProperties.hasMultiDestQuery()) {
+          plannerCtx.setInsertToken(ast, isTmpFileDest);
+        } else if (plannerCtx != null && qbp.getClauseNamesForDest().size() == 2) {
+          // For multi-insert query, currently we only optimize the FROM clause.
+          // Hence, introduce multi-insert token on top of it.
+          // However, first we need to reset existing token (insert).
+          // Using qbp.getClauseNamesForDest().size() >= 2 would be
+          // equivalent, but we use == to avoid setting the property
+          // multiple times
+          plannerCtx.resetToken();
+          plannerCtx.setMultiInsertToken((ASTNode) qbp.getQueryFrom().getChild(0));
+        }
+        break;
+
+      case HiveParser.TOK_FROM:
+        int child_count = ast.getChildCount();
+        if (child_count != 1) {
+          throw new SemanticException(generateErrorMessage(ast,
+              "Multiple Children " + child_count));
+        }
+
+        if (!qbp.getIsSubQ()) {
+          qbp.setQueryFromExpr(ast);
+        }
+
+        // Check if this is a subquery / lateral view
+        ASTNode frm = (ASTNode) ast.getChild(0);
+        if (frm.getToken().getType() == HiveParser.TOK_TABREF) {
+          processTable(qb, frm);
+        } else if (frm.getToken().getType() == HiveParser.TOK_VIRTUAL_TABLE) {
+          // Create a temp table with the passed values in it then rewrite this portion of the
+          // tree to be from that table.
+          ASTNode newFrom = genValuesTempTable(frm, qb);
+          ast.setChild(0, newFrom);
+          processTable(qb, newFrom);
+        } else if (frm.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+          processSubQuery(qb, frm);
+        } else if (frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW ||
+            frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) {
+          queryProperties.setHasLateralViews(true);
+          processLateralView(qb, frm);
+        } else if (isJoinToken(frm)) {
+          processJoin(qb, frm);
+          qbp.setJoinExpr(frm);
+        }else if(frm.getToken().getType() == HiveParser.TOK_PTBLFUNCTION){
+          queryProperties.setHasPTF(true);
+          processPTF(qb, frm);
+        }
+        break;
+
+      case HiveParser.TOK_CLUSTERBY:
+        // Get the clusterby aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasClusterBy(true);
+        qbp.setClusterByExprForClause(ctx_1.dest, ast);
+        break;
+
+      case HiveParser.TOK_DISTRIBUTEBY:
+        // Get the distribute by aliases - these are aliased to the entries in
+        // the select list
+        queryProperties.setHasDistributeBy(true);
+        qbp.setDistributeByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+        } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.ORDERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+        }
+        break;
+
+      case HiveParser.TOK_SORTBY:
+        // Get the sort by aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasSortBy(true);
+        qbp.setSortByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_SORTBY_CONFLICT.getMsg()));
+        } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.ORDERBY_SORTBY_CONFLICT.getMsg()));
+        }
+
+        break;
+
+      case HiveParser.TOK_ORDERBY:
+        // Get the order by aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasOrderBy(true);
+        qbp.setOrderByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_ORDERBY_CONFLICT.getMsg()));
+        }
+        break;
+
+      case HiveParser.TOK_GROUPBY:
+      case HiveParser.TOK_ROLLUP_GROUPBY:
+      case HiveParser.TOK_CUBE_GROUPBY:
+      case HiveParser.TOK_GROUPING_SETS:
+        // Get the groupby aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasGroupBy(true);
+        if (qbp.getJoinExpr() != null) {
+          queryProperties.setHasJoinFollowedByGroupBy(true);
+        }
+        if (qbp.getSelForClause(ctx_1.dest).getToken().getType() == HiveParser.TOK_SELECTDI) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.SELECT_DISTINCT_WITH_GROUPBY.getMsg()));
+        }
+        qbp.setGroupByExprForClause(ctx_1.dest, ast);
+        skipRecursion = true;
+
+        // Rollup and Cubes are syntactic sugar on top of grouping sets
+        if (ast.getToken().getType() == HiveParser.TOK_ROLLUP_GROUPBY) {
+          qbp.getDestRollups().add(ctx_1.dest);
+        } else if (ast.getToken().getType() == HiveParser.TOK_CUBE_GROUPBY) {
+          qbp.getDestCubes().add(ctx_1.dest);
+        } else if (ast.getToken().getType() == HiveParser.TOK_GROUPING_SETS) {
+          qbp.getDestGroupingSets().add(ctx_1.dest);
+        }
+        break;
+
+      case HiveParser.TOK_HAVING:
+        qbp.setHavingExprForClause(ctx_1.dest, ast);
+        qbp.addAggregationExprsForClause(ctx_1.dest,
+            doPhase1GetAggregationsFromSelect(ast, qb, ctx_1.dest));
+        break;
+
+      case HiveParser.KW_WINDOW:
+        if (!qb.hasWindowingSpec(ctx_1.dest) ) {
+          throw new SemanticException(generateErrorMessage(ast,
+              "Query has no Cluster/Distribute By; but has a Window definition"));
+        }
+        handleQueryWindowClauses(qb, ctx_1, ast);
+        break;
+
+      case HiveParser.TOK_LIMIT:
+        if (ast.getChildCount() == 2) {
+          qbp.setDestLimit(ctx_1.dest,
+              new Integer(ast.getChild(0).getText()),
+              new Integer(ast.getChild(1).getText()));
+        } else {
+          qbp.setDestLimit(ctx_1.dest, new Integer(0),
+              new Integer(ast.getChild(0).getText()));
+        }
+        break;
+
+      case HiveParser.TOK_ANALYZE:
+        // Case of analyze command
+
+        String table_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0)).toLowerCase();
+
+
+        qb.setTabAlias(table_name, table_name);
+        qb.addAlias(table_name);
+        qb.getParseInfo().setIsAnalyzeCommand(true);
+        qb.getParseInfo().setNoScanAnalyzeCommand(this.noscan);
+        qb.getParseInfo().setPartialScanAnalyzeCommand(this.partialscan);
+        // Allow analyze the whole table and dynamic partitions
+        HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
+        HiveConf.setVar(conf, HiveConf.ConfVars.HIVEMAPREDMODE, "nonstrict");
+
+        break;
+
+      case HiveParser.TOK_UNIONALL:
+        if (!qbp.getIsSubQ()) {
+          // this shouldn't happen. The parser should have converted the union to be
+          // contained in a subquery. Just in case, we keep the error as a fallback.
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.UNION_NOTIN_SUBQ.getMsg()));
+        }
+        skipRecursion = false;
+        break;
+
+      case HiveParser.TOK_INSERT:
+        ASTNode destination = (ASTNode) ast.getChild(0);
+        Tree tab = destination.getChild(0);
+
+        // Proceed if AST contains partition & If Not Exists
+        if (destination.getChildCount() == 2 &&
+            tab.getChildCount() == 2 &&
+            destination.getChild(1).getType() == HiveParser.TOK_IFNOTEXISTS) {
+          String tableName = tab.getChild(0).getChild(0).getText();
+
+          Tree partitions = tab.getChild(1);
+          int childCount = partitions.getChildCount();
+          HashMap<String, String> partition = new HashMap<String, String>();
+          for (int i = 0; i < childCount; i++) {
+            String partitionName = partitions.getChild(i).getChild(0).getText();
+            Tree pvalue = partitions.getChild(i).getChild(1);
+            if (pvalue == null) {
+              break;
+            }
+            String partitionVal = stripQuotes(pvalue.getText());
+            partition.put(partitionName, partitionVal);
+          }
+          // if it is a dynamic partition throw the exception
+          if (childCount != partition.size()) {
+            throw new SemanticException(ErrorMsg.INSERT_INTO_DYNAMICPARTITION_IFNOTEXISTS
+                .getMsg(partition.toString()));
+          }
+          Table table = null;
+          try {
+            table = this.getTableObjectByName(tableName);
+          } catch (HiveException ex) {
+            throw new SemanticException(ex);
+          }
+          try {
+            Partition parMetaData = db.getPartition(table, partition, false);
+            // Check whether the partition exists; if it does, skip the overwrite
+            if (parMetaData != null) {
+              phase1Result = false;
+              skipRecursion = true;
+              LOG.info("Partition already exists so insert into overwrite " +
+                  "skipped for partition : " + parMetaData.toString());
+              break;
+            }
+          } catch (HiveException e) {
+            LOG.info("Error while getting metadata : ", e);
+          }
+          validatePartSpec(table, partition, (ASTNode)tab, conf, false);
+        }
+        skipRecursion = false;
+        break;
+      case HiveParser.TOK_LATERAL_VIEW:
+      case HiveParser.TOK_LATERAL_VIEW_OUTER:
+        // todo: nested LV
+        assert ast.getChildCount() == 1;
+        qb.getParseInfo().getDestToLateralView().put(ctx_1.dest, ast);
+        break;
+      case HiveParser.TOK_CTE:
+        processCTE(qb, ast);
+        break;
+      default:
+        skipRecursion = false;
+        break;
+      }
+    }
+
+    if (!skipRecursion) {
+      // Iterate over the rest of the children
+      int child_count = ast.getChildCount();
+      for (int child_pos = 0; child_pos < child_count && phase1Result; ++child_pos) {
+        // Recurse
+        phase1Result = phase1Result && doPhase1(
+            (ASTNode)ast.getChild(child_pos), qb, ctx_1, plannerCtx);
+      }
+    }
+    return phase1Result;
+  }
+
+  /**
+   * This is phase1 of supporting specifying schema in insert statement
+   * insert into foo(z,y) select a,b from bar;
+   * @see #handleInsertStatementSpec(java.util.List, String, RowResolver, RowResolver, QB, ASTNode)
+   * @throws SemanticException
+   */
+  private void handleInsertStatementSpecPhase1(ASTNode ast, QBParseInfo qbp, Phase1Ctx ctx_1) throws SemanticException {
+    ASTNode tabColName = (ASTNode)ast.getChild(1);
+    if(ast.getType() == HiveParser.TOK_INSERT_INTO && tabColName != null && tabColName.getType() == HiveParser.TOK_TABCOLNAME) {
+      //we have "insert into foo(a,b)..."; parser will enforce that 1+ columns are listed if TOK_TABCOLNAME is present
+      List<String> targetColNames = new ArrayList<String>();
+      for(Node col : tabColName.getChildren()) {
+        assert ((ASTNode)col).getType() == HiveParser.Identifier :
+          "expected token " + HiveParser.Identifier + " found " + ((ASTNode)col).getType();
+        targetColNames.add(((ASTNode)col).getText());
+      }
+      String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0),
+        SessionState.get().getCurrentDatabase());
+      qbp.setDestSchemaForClause(ctx_1.dest, targetColNames);
+      Set<String> targetColumns = new HashSet<String>();
+      targetColumns.addAll(targetColNames);
+      if(targetColNames.size() != targetColumns.size()) {
+        throw new SemanticException(generateErrorMessage(tabColName,
+          "Duplicate column name detected in " + fullTableName + " table schema specification"));
+      }
+      Table targetTable = null;
+      try {
+        targetTable = db.getTable(fullTableName, false);
+      }
+      catch (HiveException ex) {
+        LOG.error("Error processing HiveParser.TOK_DESTINATION: " + ex.getMessage(), ex);
+        throw new SemanticException(ex);
+      }
+      if(targetTable == null) {
+        throw new SemanticException(generateErrorMessage(ast,
+          "Unable to access metadata for table " + fullTableName));
+      }
+      for(FieldSchema f : targetTable.getCols()) {
+        //parser only allows foo(a,b), not foo(foo.a, foo.b)
+        targetColumns.remove(f.getName());
+      }
+      if(!targetColumns.isEmpty()) {//here we need to see if remaining columns are dynamic partition columns
+            /* We just checked the user specified schema columns against the regular table columns and found some which are not
+            'regular'.  Now check if they are dynamic partition columns.
+              For dynamic partitioning,
+              Given "create table multipart(a int, b int) partitioned by (c int, d int);"
+              for "insert into multipart partition(c='1',d)(d,a) values(2,3);" we expect parse tree to look like this
+               (TOK_INSERT_INTO
+                (TOK_TAB
+                  (TOK_TABNAME multipart)
+                  (TOK_PARTSPEC
+                    (TOK_PARTVAL c '1')
+                    (TOK_PARTVAL d)
+                  )
+                )
+                (TOK_TABCOLNAME d a)
+               )*/
+        List<String> dynamicPartitionColumns = new ArrayList<String>();
+        if(ast.getChild(0) != null && ast.getChild(0).getType() == HiveParser.TOK_TAB) {
+          ASTNode tokTab = (ASTNode)ast.getChild(0);
+          ASTNode tokPartSpec = (ASTNode)tokTab.getFirstChildWithType(HiveParser.TOK_PARTSPEC);
+          if(tokPartSpec != null) {
+            for(Node n : tokPartSpec.getChildren()) {
+              ASTNode tokPartVal = null;
+              if(n instanceof ASTNode) {
+                tokPartVal = (ASTNode)n;
+              }
+              if(tokPartVal != null && tokPartVal.getType() == HiveParser.TOK_PARTVAL && tokPartVal.getChildCount() == 1) {
+                assert tokPartVal.getChild(0).getType() == HiveParser.Identifier :
+                  "Expected column name; found tokType=" + tokPartVal.getType();
+                dynamicPartitionColumns.add(tokPartVal.getChild(0).getText());
+              }
+            }
+          }
+        }
+        for(String colName : dynamicPartitionColumns) {
+          targetColumns.remove(colName);
+        }
+        if(!targetColumns.isEmpty()) {
+          //Found some columns in the user specified schema which are neither regular nor dynamic partition columns
+          throw new SemanticException(generateErrorMessage(tabColName,
+            "'" + (targetColumns.size() == 1 ? targetColumns.iterator().next() : targetColumns) +
+              "' in insert schema specification " + (targetColumns.size() == 1 ? "is" : "are") +
+              " not found among regular columns of " +
+              fullTableName + " nor dynamic partition columns."));
+        }
+      }
+    }
+  }
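
A standalone sketch (not part of the patch) of the set check
handleInsertStatementSpecPhase1 performs for the multipart example in the comment
above: every name in the target column list must be either a regular column or a
dynamic partition column, otherwise the statement is rejected.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class InsertSchemaCheckSketch {
  public static void main(String[] args) {
    // insert into multipart partition(c='1',d) (d,a) values(2,3)
    Set<String> targetColumns = new HashSet<>(Arrays.asList("d", "a"));
    Set<String> regularColumns = new HashSet<>(Arrays.asList("a", "b"));
    Set<String> dynamicPartitionColumns = new HashSet<>(Arrays.asList("d"));

    targetColumns.removeAll(regularColumns);
    targetColumns.removeAll(dynamicPartitionColumns);
    System.out.println(targetColumns.isEmpty());   // true -> spec is accepted
  }
}
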
+
+  public void getMaterializationMetadata(QB qb) throws SemanticException {
+    try {
+      gatherCTEReferences(qb, rootClause);
+      int threshold = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_CTE_MATERIALIZE_THRESHOLD);
+      for (CTEClause cte : Sets.newHashSet(aliasToCTEs.values())) {
+        if (threshold >= 0 && cte.reference >= threshold) {
+          cte.materialize = true;
+        }
+      }
+    } catch (HiveException e) {
+      // Has to use full name to make sure it does not conflict with
+      // org.apache.commons.lang.StringUtils
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      if (e instanceof SemanticException) {
+        throw (SemanticException)e;
+      }
+      throw new SemanticException(e.getMessage(), e);
+    }
+  }
+
+  private void gatherCTEReferences(QBExpr qbexpr, CTEClause parent) throws HiveException {
+    if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+      gatherCTEReferences(qbexpr.getQB(), parent);
+    } else {
+      gatherCTEReferences(qbexpr.getQBExpr1(), parent);
+      gatherCTEReferences(qbexpr.getQBExpr2(), parent);
+    }
+  }
+
+  // TODO: check view references, too
+  private void gatherCTEReferences(QB qb, CTEClause current) throws HiveException {
+    for (String alias : qb.getTabAliases()) {
+      String tabName = qb.getTabNameForAlias(alias);
+      String cteName = tabName.toLowerCase();
+
+      CTEClause cte = findCTEFromName(qb, cteName);
+      if (cte != null) {
+        if (ctesExpanded.contains(cteName)) {
+          throw new SemanticException("Recursive cte " + cteName +
+                  " detected (cycle: " + StringUtils.join(ctesExpanded, " -> ") +
+                  " -> " + cteName + ").");
+        }
+        cte.reference++;
+        current.parents.add(cte);
+        if (cte.qbExpr != null) {
+          continue;
+        }
+        cte.qbExpr = new QBExpr(cteName);
+        doPhase1QBExpr(cte.cteNode, cte.qbExpr, qb.getId(), cteName);
+
+        ctesExpanded.add(cteName);
+        gatherCTEReferences(cte.qbExpr, cte);
+        ctesExpanded.remove(ctesExpanded.size() - 1);
+      }
+    }
+    for (String alias : qb.getSubqAliases()) {
+      gatherCTEReferences(qb.getSubqForAlias(alias), current);
+    }
+  }
+
+  public void getMetaData(QB qb) throws SemanticException {
+    getMetaData(qb, false);
+  }
+
+  public void getMetaData(QB qb, boolean enableMaterialization) throws SemanticException {
+    try {
+      if (enableMaterialization) {
+        getMaterializationMetadata(qb);
+      }
+      getMetaData(qb, null);
+    } catch (HiveException e) {
+      // Has to use full name to make sure it does not conflict with
+      // org.apache.commons.lang.StringUtils
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      if (e instanceof SemanticException) {
+        throw (SemanticException)e;
+      }
+      throw new SemanticException(e.getMessage(), e);
+    }
+  }
+
+  private void getMetaData(QBExpr qbexpr, ReadEntity parentInput)
+          throws HiveException {
+    if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+      getMetaData(qbexpr.getQB(), parentInput);
+    } else {
+      getMetaData(qbexpr.getQBExpr1(), parentInput);
+      getMetaData(qbexpr.getQBExpr2(), parentInput);
+    }
+  }
+
+  @SuppressWarnings("nls")
+  private void getMetaData(QB qb, ReadEntity parentInput)
+          throws HiveException {
+    LOG.info("Get metadata for source tables");
+
+    // Go over the tables and populate the related structures.
+    // We have to materialize the table alias list since we might
+    // modify it in the middle for view rewrite.
+    List<String> tabAliases = new ArrayList<String>(qb.getTabAliases());
+
+    // Keep track of view alias to view name and read entity
+    // For eg: for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T
+    // keeps track of full view name and read entity corresponding to alias V3, V3:V2, V3:V2:V1.
+    // This is needed for tracking the dependencies for inputs, along with their parents.
+    Map<String, ObjectPair<String, ReadEntity>> aliasToViewInfo =
+        new HashMap<String, ObjectPair<String, ReadEntity>>();
+
+    /*
+     * used to capture view to SQ conversions. This is used to check for
+     * recursive CTE invocations.
+     */
+    Map<String, String> sqAliasToCTEName = new HashMap<String, String>();
+
+    for (String alias : tabAliases) {
+      String tabName = qb.getTabNameForAlias(alias);
+      String cteName = tabName.toLowerCase();
+
+      Table tab = db.getTable(tabName, false);
+      if (tab == null ||
+              tab.getDbName().equals(SessionState.get().getCurrentDatabase())) {
+        Table materializedTab = ctx.getMaterializedTable(cteName);
+        if (materializedTab == null) {
+          // we first look for this alias from CTE, and then from catalog.
+          CTEClause cte = findCTEFromName(qb, cteName);
+          if (cte != null) {
+            if (!cte.materialize) {
+              addCTEAsSubQuery(qb, cteName, alias);
+              sqAliasToCTEName.put(alias, cteName);
+              continue;
+            }
+            tab = materializeCTE(cteName, cte);
+          }
+        } else {
+          tab = materializedTab;
+        }
+      }
+
+      if (tab == null) {
+        ASTNode src = qb.getParseInfo().getSrcForAlias(alias);
+        if (null != src) {
+          throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(src));
+        } else {
+          throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(alias));
+        }
+      }
+      if (tab.isView()) {
+        if (qb.getParseInfo().isAnalyzeCommand()) {
+          throw new SemanticException(ErrorMsg.ANALYZE_VIEW.getMsg());
+        }
+        String fullViewName = tab.getDbName() + "." + tab.getTableName();
+        // Prevent view cycles
+        if (viewsExpanded.contains(fullViewName)) {
+          throw new SemanticException("Recursive view " + fullViewName +
+              " detected (cycle: " + StringUtils.join(viewsExpanded, " -> ") +
+              " -> " + fullViewName + ").");
+        }
+        replaceViewReferenceWithDefinition(qb, tab, tabName, alias);
+        // This is the last time we'll see the Table objects for views, so add it to the inputs
+        // now. isInsideView will tell if this view is embedded in another view.
+        // If the view is Inside another view, it should have at least one parent
+        if (qb.isInsideView() && parentInput == null) {
+          parentInput = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
+        }
+        ReadEntity viewInput = new ReadEntity(tab, parentInput, !qb.isInsideView());
+        viewInput = PlanUtils.addInput(inputs, viewInput);
+        aliasToViewInfo.put(alias, new ObjectPair<String, ReadEntity>(fullViewName, viewInput));
+        String aliasId = getAliasId(alias, qb);
+        if (aliasId != null) {
+          aliasId = aliasId.replace(SemanticAnalyzer.SUBQUERY_TAG_1, "")
+            .replace(SemanticAnalyzer.SUBQUERY_TAG_2, "");
+        }
+        viewAliasToInput.put(aliasId, viewInput);
+        continue;
+      }
+
+      if (!InputFormat.class.isAssignableFrom(tab.getInputFormatClass())) {
+        throw new SemanticException(generateErrorMessage(
+                qb.getParseInfo().getSrcForAlias(alias),
+                ErrorMsg.INVALID_INPUT_FORMAT_TYPE.getMsg()));
+      }
+
+      qb.getMetaData().setSrcForAlias(alias, tab);
+
+      if (qb.getParseInfo().isAnalyzeCommand()) {
+        // allow partial partition specification for noscan since noscan is fast.
+        TableSpec ts = new TableSpec(db, conf, (ASTNode) ast.getChild(0), true, this.noscan);
+        if (ts.specType == SpecType.DYNAMIC_PARTITION) { // dynamic partitions
+          try {
+            ts.partitions = db.getPartitionsByNames(ts.tableHandle, ts.partSpec);
+          } catch (HiveException e) {
+            throw new SemanticException(generateErrorMessage(
+                    qb.getParseInfo().getSrcForAlias(alias),
+                    "Cannot get partitions for " + ts.partSpec), e);
+          }
+        }
+        // validate partial scan command
+        QBParseInfo qbpi = qb.getParseInfo();
+        if (qbpi.isPartialScanAnalyzeCommand()) {
+          Class<? extends InputFormat> inputFormatClass = null;
+          switch (ts.specType) {
+            case TABLE_ONLY:
+            case DYNAMIC_PARTITION:
+              inputFormatClass = ts.tableHandle.getInputFormatClass();
+              break;
+            case STATIC_PARTITION:
+              inputFormatClass = ts.partHandle.getInputFormatClass();
+              break;
+            default:
+              assert false;
+          }
+          // throw a HiveException for formats other than rcfile or orcfile.
+          if (!(inputFormatClass.equals(RCFileInputFormat.class) || inputFormatClass
+                  .equals(OrcInputFormat.class))) {
+            throw new SemanticException(ErrorMsg.ANALYZE_TABLE_PARTIALSCAN_NON_RCFILE.getMsg());
+          }
+        }
+
+        tab.setTableSpec(ts);
+        qb.getParseInfo().addTableSpec(alias, ts);
+      }
+
+      ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
+      // Temporary tables created during the execution are not the input sources
+      if (!PlanUtils.isValuesTempTable(alias)) {
+        PlanUtils.addInput(inputs,
+                new ReadEntity(tab, parentViewInfo, parentViewInfo == null), mergeIsDirect);
+      }
+    }
+
+    LOG.info("Get metadata for subqueries");
+    // Go over the subqueries and getMetaData for these
+    for (String alias : qb.getSubqAliases()) {
+      boolean wasView = aliasToViewInfo.containsKey(alias);
+      boolean wasCTE = sqAliasToCTEName.containsKey(alias);
+      ReadEntity newParentInput = null;
+      if (wasView) {
+        viewsExpanded.add(aliasToViewInfo.get(alias).getFirst());
+        newParentInput = aliasToViewInfo.get(alias).getSecond();
+      } else if (wasCTE) {
+        ctesExpanded.add(sqAliasToCTEName.get(alias));
+      }

<TRUNCATED>

[3/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Posted by gu...@apache.org.
http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index d58f447..83e89af 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -266,11 +266,14 @@ public class GenTezUtils {
             }
           }
           // This TableScanOperator could be part of semijoin optimization.
-          Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap =
-                  context.parseContext.getRsOpToTsOpMap();
-          for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) {
-            if (rsOpToTsOpMap.get(rs) == orig) {
-              rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot);
+          Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
+                  context.parseContext.getRsToSemiJoinBranchInfo();
+          for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
+            SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
+            if (sjInfo.getTsOp() == orig) {
+              SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo(
+                      (TableScanOperator)newRoot, sjInfo.getIsHint());
+              rsToSemiJoinBranchInfo.put(rs, newSJInfo);
             }
           }
         }
@@ -516,19 +519,18 @@ public class GenTezUtils {
     return EdgeType.SIMPLE_EDGE;
   }
 
-  public static void processDynamicMinMaxPushDownOperator(
+  public static void processDynamicSemiJoinPushDownOperator(
           GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo,
           ReduceSinkOperator rs)
           throws SemanticException {
-    TableScanOperator ts = procCtx.parseContext.getRsOpToTsOpMap().get(rs);
+    SemiJoinBranchInfo sjInfo = procCtx.parseContext.getRsToSemiJoinBranchInfo().get(rs);
 
     List<BaseWork> rsWorkList = procCtx.childToWorkMap.get(rs);
-    if (ts == null || rsWorkList == null) {
+    if (sjInfo == null || rsWorkList == null) {
       // This happens when the ReduceSink's edge has been removed by cycle
       // detection logic. Nothing to do here.
       return;
     }
-    LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
 
     if (rsWorkList.size() != 1) {
       StringBuilder sb = new StringBuilder();
@@ -541,6 +543,9 @@ public class GenTezUtils {
       throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString());
     }
 
+    TableScanOperator ts = sjInfo.getTsOp();
+    LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
+
     BaseWork parentWork = rsWorkList.get(0);
     BaseWork childWork = procCtx.rootToWorkMap.get(ts);
 
@@ -611,7 +616,7 @@ public class GenTezUtils {
         skip = true;
       }
     }
-    context.getRsOpToTsOpMap().remove(rs);
+    context.getRsToSemiJoinBranchInfo().remove(rs);
   }
 
   private static class DynamicValuePredicateContext implements NodeProcessorCtx {

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g b/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
index 8e70a46..e110fb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/HintParser.g
@@ -31,6 +31,7 @@ tokens {
   TOK_MAPJOIN;
   TOK_STREAMTABLE;
   TOK_HINTARGLIST;
+  TOK_LEFTSEMIJOIN;
 }
 
 @header {
@@ -69,6 +70,7 @@ hintItem
 hintName
     :
     KW_MAPJOIN -> TOK_MAPJOIN
+    | KW_SEMI -> TOK_LEFTSEMIJOIN
     | KW_STREAMTABLE -> TOK_STREAMTABLE
     ;
 
@@ -80,4 +82,5 @@ hintArgs
 hintArgName
     :
     Identifier
+    | Number
     ;
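
For context, the two grammar additions above let the hint parser recognize a semi(...) hint (KW_SEMI maps to TOK_LEFTSEMIJOIN) and accept numeric hint arguments. A minimal sketch of the query shapes this enables, using invented table, column, and size values, and matching the three argument formats documented on parseSemiJoinHint further below:

    // Hypothetical queries (aliases, columns and sizes invented) showing the
    // three hint-argument shapes the grammar change above makes parseable.
    public class SemiJoinHintExamples {
      // 1. table, column, bloom filter entries
      static final String TABLE_COLUMN_ENTRIES =
          "SELECT /*+ semi(b, id, 5000) */ a.val FROM a JOIN b ON a.id = b.id";
      // 2. table, bloom filter entries
      static final String TABLE_ENTRIES =
          "SELECT /*+ semi(b, 5000) */ a.val FROM a JOIN b ON a.id = b.id";
      // 3. table, column
      static final String TABLE_COLUMN =
          "SELECT /*+ semi(b, id) */ a.val FROM a JOIN b ON a.id = b.id";
    }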

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
index 3f9f76c..9a69f90 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
@@ -33,17 +33,7 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.QueryProperties;
 import org.apache.hadoop.hive.ql.QueryState;
-import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.FetchTask;
-import org.apache.hadoop.hive.ql.exec.JoinOperator;
-import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
-import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.Operator;
-import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
-import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.SelectOperator;
-import org.apache.hadoop.hive.ql.exec.TableScanOperator;
-import org.apache.hadoop.hive.ql.exec.Task;
+import org.apache.hadoop.hive.ql.exec.*;
 import org.apache.hadoop.hive.ql.hooks.LineageInfo;
 import org.apache.hadoop.hive.ql.hooks.ReadEntity;
 import org.apache.hadoop.hive.ql.metadata.Table;
@@ -126,11 +116,10 @@ public class ParseContext {
   private boolean needViewColumnAuthorization;
   private Set<FileSinkDesc> acidFileSinks = Collections.emptySet();
 
-  // Map to store mapping between reduce sink Operator and TS Operator for semijoin
-  private Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap =
-          new HashMap<ReduceSinkOperator, TableScanOperator>();
   private Map<ReduceSinkOperator, RuntimeValuesInfo> rsToRuntimeValuesInfo =
           new HashMap<ReduceSinkOperator, RuntimeValuesInfo>();
+  private Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo =
+          new HashMap<>();
 
   public ParseContext() {
   }
@@ -666,11 +655,11 @@ public class ParseContext {
     return rsToRuntimeValuesInfo;
   }
 
-  public void setRsOpToTsOpMap(Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap) {
-    this.rsOpToTsOpMap = rsOpToTsOpMap;
+  public void setRsToSemiJoinBranchInfo(Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo) {
+    this.rsToSemiJoinBranchInfo = rsToSemiJoinBranchInfo;
   }
 
-  public Map<ReduceSinkOperator, TableScanOperator> getRsOpToTsOpMap() {
-    return rsOpToTsOpMap;
+  public Map<ReduceSinkOperator, SemiJoinBranchInfo> getRsToSemiJoinBranchInfo() {
+    return rsToSemiJoinBranchInfo;
   }
 }
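
Callers of the removed getRsOpToTsOpMap() now go through the SemiJoinBranchInfo wrapper instead. A minimal sketch of that migration, with a hypothetical helper class that is not part of this patch:

    import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
    import org.apache.hadoop.hive.ql.exec.TableScanOperator;
    import org.apache.hadoop.hive.ql.parse.ParseContext;
    import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;

    public class SemiJoinBranchLookup {
      // Before this patch: pctx.getRsOpToTsOpMap().get(rs) returned the TableScan directly.
      static TableScanOperator semiJoinTargetTs(ParseContext pctx, ReduceSinkOperator rs) {
        SemiJoinBranchInfo info = pctx.getRsToSemiJoinBranchInfo().get(rs);
        // info.getIsHint() distinguishes hint-enforced branches from ones the optimizer added on its own.
        return info == null ? null : info.getTsOp();
      }
    }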

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
index ec76fb7..bcef252 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBJoinTree.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.hive.ql.parse;
 
+import java.util.Arrays;
+
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -82,6 +84,7 @@ public class QBJoinTree implements Serializable, Cloneable {
    * We then add a Filter Operator after the Join Operator for this QBJoinTree.
    */
   private final List<ASTNode> postJoinFilters;
+  private Map<String, SemiJoinHint> semiJoinHint;
 
   /**
    * constructor.
@@ -429,4 +432,17 @@ public class QBJoinTree implements Serializable, Cloneable {
 
     return cloned;
   }
+
+  public void setSemiJoinHint(Map<String, SemiJoinHint> semiJoinHint) {
+    this.semiJoinHint = semiJoinHint;
+  }
+
+  public Map<String, SemiJoinHint> getSemiJoinHint() {
+    return semiJoinHint;
+  }
+
+  @Override
+  public String toString() {
+    return "QBJoinTree [leftAlias=" + leftAlias + ", rightAliases=" + Arrays.toString(rightAliases) + ", leftAliases=" + Arrays.toString(leftAliases) + ", semiJoinHint=" + semiJoinHint + "]";
+  }
 }
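
The hint map stored on QBJoinTree is what later lands on the JoinDesc (see desc.setSemiJoinHints(...) in the SemanticAnalyzer hunk below). A minimal sketch of that hand-off, with hypothetical alias, column, and entry-count values:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.hive.ql.parse.QBJoinTree;
    import org.apache.hadoop.hive.ql.parse.SemiJoinHint;

    public class SemiJoinHintHandoff {
      static QBJoinTree joinTreeWithHint() {
        Map<String, SemiJoinHint> hints = new HashMap<>();
        // Enforce a semijoin reduction against alias "b" on column "id" with a
        // 5000-entry bloom filter (all three values are made up for illustration).
        hints.put("b", new SemiJoinHint("b", "id", 5000));

        QBJoinTree joinTree = new QBJoinTree();
        joinTree.setSemiJoinHint(hints);
        // Plan generation then copies the map onto the join descriptor:
        //   desc.setSemiJoinHints(joinTree.getSemiJoinHint());
        return joinTree;
      }
    }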

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index b5a5645..e4ca25b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -8122,6 +8122,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
 
     JoinDesc desc = new JoinDesc(exprMap, outputColumnNames,
         join.getNoOuterJoin(), joinCondns, filterMap, joinKeys);
+    desc.setSemiJoinHints(join.getSemiJoinHint());
     desc.setReversedExprs(reversedExprs);
     desc.setFilterMap(join.getFilterMap());
     // For outer joins, add filters that apply to more than one input
@@ -8669,6 +8670,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       parseStreamTables(joinTree, qb);
     }
 
+    if (qb.getParseInfo().getHints() != null) {
+      // TODO: do we need this for unique join?
+      joinTree.setSemiJoinHint(parseSemiJoinHint(qb.getParseInfo().getHints()));
+    }
     return joinTree;
   }
 
@@ -8967,6 +8972,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       if ((conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) == false) {
         parseStreamTables(joinTree, qb);
       }
+
+      joinTree.setSemiJoinHint(parseSemiJoinHint(qb.getParseInfo().getHints()));
     }
 
     return joinTree;
@@ -9014,6 +9021,62 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
     joinTree.setStreamAliases(streamAliases);
   }
 
+  /** Parses semijoin hints in the query and returns a map from table alias to SemiJoinHint
+   *  (the bloom filter entry count is null when not specified).
+   *  Hints can come in 3 formats:
+   *  1. TableName, ColumnName, bloom filter entries
+   *  2. TableName, bloom filter entries
+   *  3. TableName, ColumnName
+   */
+  public Map<String, SemiJoinHint> parseSemiJoinHint(ASTNode hints) throws SemanticException {
+    if (hints == null) return null;
+    Map<String, SemiJoinHint> result = null;
+    for (Node hintNode : hints.getChildren()) {
+      ASTNode hint = (ASTNode) hintNode;
+      if (hint.getChild(0).getType() != HintParser.TOK_LEFTSEMIJOIN) continue;
+      if (result == null) {
+        result = new HashMap<>();
+      }
+      String alias = null;
+      String colName = null;
+      Tree args = hint.getChild(1);
+      for (int i = 0; i < args.getChildCount(); i++) {
+        // We can have table names, column names or sizes here (or incorrect hint if the user is so inclined).
+        String text = args.getChild(i).getText();
+        Integer number = null;
+        try {
+          number = Integer.parseInt(text);
+        } catch (NumberFormatException ex) { // Ignore.
+        }
+        if (number != null) {
+          if (alias == null) {
+            throw new SemanticException("Invalid semijoin hint - arg " + i + " ("
+                + text + ") is a number but the previous one is not an alias");
+          }
+          SemiJoinHint sjHint = new SemiJoinHint(alias, colName, number);
+          result.put(alias, sjHint);
+          alias = null;
+          colName = null;
+        } else {
+          if (alias == null) {
+            alias = text;
+          } else if (colName == null) {
+            colName = text;
+          } else {
+            // No bloom filter entries provided.
+            SemiJoinHint sjHint = new SemiJoinHint(alias, colName, null);
+            result.put(alias, sjHint);
+            alias = text;
+            colName = null;
+          }
+        }
+      }
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Semijoin hint parsed: " + result);
+    }
+    return result;
+  }
+
   /**
    * Merges node to target
    */