You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2016/12/16 18:28:31 UTC
[18/21] hive git commit: HIVE-15192 : Use Calcite to de-correlate and
plan subqueries (Vineet Garg via Ashutosh Chauhan)
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
new file mode 100644
index 0000000..5f58aae
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
@@ -0,0 +1,329 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
+
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.calcite.plan.RelOptRuleOperand;
+import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.CorrelationId;
+import org.apache.calcite.rel.core.Filter;
+import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.rel.core.RelFactories;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rex.LogicVisitor;
+import org.apache.calcite.rex.RexInputRef;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.rex.RexShuttle;
+import org.apache.calcite.rex.RexSubQuery;
+import org.apache.calcite.rex.RexUtil;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.type.SqlTypeName;
+import org.apache.calcite.tools.RelBuilderFactory;
+import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.Pair;
+
+import com.google.common.collect.ImmutableList;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveSubQRemoveRelBuilder;
+
+/**
+ * NOTE: this rule is replicated from Calcite's SubqueryRemoveRule
+ * Transform that converts IN, EXISTS and scalar sub-queries into joins.
+ * TODO:
+ * Reason this is replicated instead of using Calcite's is
+ * Calcite creates null literal with null type but hive needs it to be properly typed
+ * Need fix for Calcite-1493
+ *
+ * <p>Sub-queries are represented by {@link RexSubQuery} expressions.
+ *
+ * <p>A sub-query may or may not be correlated. If a sub-query is correlated,
+ * the wrapped {@link RelNode} will contain a {@link RexCorrelVariable} before
+ * the rewrite, and the product of the rewrite will be a {@link Correlate}.
+ * The Correlate can be removed using {@link RelDecorrelator}.
+ */
+public abstract class HiveSubQueryRemoveRule extends RelOptRule{
+
+ public static final HiveSubQueryRemoveRule FILTER =
+ new HiveSubQueryRemoveRule(
+ operand(Filter.class, null, RexUtil.SubQueryFinder.FILTER_PREDICATE,
+ any()),
+ HiveRelFactories.HIVE_BUILDER, "SubQueryRemoveRule:Filter") {
+ public void onMatch(RelOptRuleCall call) {
+ final Filter filter = call.rel(0);
+ //final RelBuilder builder = call.builder();
+ final HiveSubQRemoveRelBuilder builder = new HiveSubQRemoveRelBuilder(null, call.rel(0).getCluster(), null);
+ final RexSubQuery e =
+ RexUtil.SubQueryFinder.find(filter.getCondition());
+ assert e != null;
+ final RelOptUtil.Logic logic =
+ LogicVisitor.find(RelOptUtil.Logic.TRUE,
+ ImmutableList.of(filter.getCondition()), e);
+ builder.push(filter.getInput());
+ final int fieldCount = builder.peek().getRowType().getFieldCount();
+ final RexNode target = apply(e, filter.getVariablesSet(), logic,
+ builder, 1, fieldCount);
+ final RexShuttle shuttle = new ReplaceSubQueryShuttle(e, target);
+ builder.filter(shuttle.apply(filter.getCondition()));
+ builder.project(fields(builder, filter.getRowType().getFieldCount()));
+ call.transformTo(builder.build());
+ }
+ };
+
+ private HiveSubQueryRemoveRule(RelOptRuleOperand operand,
+ RelBuilderFactory relBuilderFactory,
+ String description) {
+ super(operand, relBuilderFactory, description);
+ }
+
+ protected RexNode apply(RexSubQuery e, Set<CorrelationId> variablesSet,
+ RelOptUtil.Logic logic,
+ HiveSubQRemoveRelBuilder builder, int inputCount, int offset) {
+ switch (e.getKind()) {
+ case SCALAR_QUERY:
+ builder.push(e.rel);
+ final RelMetadataQuery mq = RelMetadataQuery.instance();
+ final Boolean unique = mq.areColumnsUnique(builder.peek(),
+ ImmutableBitSet.of());
+ if (unique == null || !unique) {
+ builder.aggregate(builder.groupKey(),
+ builder.aggregateCall(SqlStdOperatorTable.SINGLE_VALUE, false, null,
+ null, builder.field(0)));
+ }
+ builder.join(JoinRelType.LEFT, builder.literal(true), variablesSet);
+ return field(builder, inputCount, offset);
+
+ case IN:
+ case EXISTS:
+ // Most general case, where the left and right keys might have nulls, and
+ // caller requires 3-valued logic return.
+ //
+ // select e.deptno, e.deptno in (select deptno from emp)
+ //
+ // becomes
+ //
+ // select e.deptno,
+ // case
+ // when ct.c = 0 then false
+ // when dt.i is not null then true
+ // when e.deptno is null then null
+ // when ct.ck < ct.c then null
+ // else false
+ // end
+ // from e
+ // left join (
+ // (select count(*) as c, count(deptno) as ck from emp) as ct
+ // cross join (select distinct deptno, true as i from emp)) as dt
+ // on e.deptno = dt.deptno
+ //
+ // If keys are not null we can remove "ct" and simplify to
+ //
+ // select e.deptno,
+ // case
+ // when dt.i is not null then true
+ // else false
+ // end
+ // from e
+ // left join (select distinct deptno, true as i from emp) as dt
+ // on e.deptno = dt.deptno
+ //
+ // We could further simplify to
+ //
+ // select e.deptno,
+ // dt.i is not null
+ // from e
+ // left join (select distinct deptno, true as i from emp) as dt
+ // on e.deptno = dt.deptno
+ //
+ // but have not yet.
+ //
+ // If the logic is TRUE we can just kill the record if the condition
+ // evaluates to FALSE or UNKNOWN. Thus the query simplifies to an inner
+ // join:
+ //
+ // select e.deptno,
+ // true
+ // from e
+ // inner join (select distinct deptno from emp) as dt
+ // on e.deptno = dt.deptno
+ //
+
+ builder.push(e.rel);
+ final List<RexNode> fields = new ArrayList<>();
+ switch (e.getKind()) {
+ case IN:
+ fields.addAll(builder.fields());
+ }
+
+ // First, the cross join
+ switch (logic) {
+ case TRUE_FALSE_UNKNOWN:
+ case UNKNOWN_AS_TRUE:
+ // Since EXISTS/NOT EXISTS are not affected by presence of
+ // null keys we do not need to generate count(*), count(c)
+ if (e.getKind() == SqlKind.EXISTS) {
+ logic = RelOptUtil.Logic.TRUE_FALSE;
+ break;
+ }
+ builder.aggregate(builder.groupKey(),
+ builder.count(false, "c"),
+ builder.aggregateCall(SqlStdOperatorTable.COUNT, false, null, "ck",
+ builder.fields()));
+ builder.as("ct");
+ if( !variablesSet.isEmpty())
+ {
+ //builder.join(JoinRelType.INNER, builder.literal(true), variablesSet);
+ builder.join(JoinRelType.LEFT, builder.literal(true), variablesSet);
+ }
+ else
+ builder.join(JoinRelType.INNER, builder.literal(true), variablesSet);
+
+ offset += 2;
+ builder.push(e.rel);
+ break;
+ }
+
+ // Now the left join
+ switch (logic) {
+ case TRUE:
+ if (fields.isEmpty()) {
+ builder.project(builder.alias(builder.literal(true), "i"));
+ builder.aggregate(builder.groupKey(0));
+ } else {
+ builder.aggregate(builder.groupKey(fields));
+ }
+ break;
+ default:
+ fields.add(builder.alias(builder.literal(true), "i"));
+ builder.project(fields);
+ builder.distinct();
+ }
+ builder.as("dt");
+ final List<RexNode> conditions = new ArrayList<>();
+ for (Pair<RexNode, RexNode> pair
+ : Pair.zip(e.getOperands(), builder.fields())) {
+ conditions.add(
+ builder.equals(pair.left, RexUtil.shift(pair.right, offset)));
+ }
+ switch (logic) {
+ case TRUE:
+ builder.join(JoinRelType.INNER, builder.and(conditions), variablesSet);
+ return builder.literal(true);
+ }
+ builder.join(JoinRelType.LEFT, builder.and(conditions), variablesSet);
+
+ final List<RexNode> keyIsNulls = new ArrayList<>();
+ for (RexNode operand : e.getOperands()) {
+ if (operand.getType().isNullable()) {
+ keyIsNulls.add(builder.isNull(operand));
+ }
+ }
+ final ImmutableList.Builder<RexNode> operands = ImmutableList.builder();
+ switch (logic) {
+ case TRUE_FALSE_UNKNOWN:
+ case UNKNOWN_AS_TRUE:
+ operands.add(
+ builder.equals(builder.field("ct", "c"), builder.literal(0)),
+ builder.literal(false));
+ //now that we are using LEFT OUTER JOIN to join inner count, count(*)
+ // with outer table, we wouldn't be able to tell if count is zero
+ // for inner table since inner join with correlated values will get rid
+ // of all values where join cond is not true (i.e where actual inner table
+ // will produce zero result). To handle this case we need to check both
+ // count is zero or count is null
+ operands.add((builder.isNull(builder.field("ct", "c"))), builder.literal(false));
+ break;
+ }
+ operands.add(builder.isNotNull(builder.field("dt", "i")),
+ builder.literal(true));
+ if (!keyIsNulls.isEmpty()) {
+ //Calcite creates null literal with Null type here but because HIVE doesn't support null type
+ // it is appropriately typed boolean
+ operands.add(builder.or(keyIsNulls), e.rel.getCluster().getRexBuilder().makeNullLiteral(SqlTypeName.BOOLEAN));
+ // we are creating filter here so should not be returning NULL. Not sure why Calcite return NULL
+ //operands.add(builder.or(keyIsNulls), builder.literal(false));
+ }
+ Boolean b = true;
+ switch (logic) {
+ case TRUE_FALSE_UNKNOWN:
+ b = null;
+ // fall through
+ case UNKNOWN_AS_TRUE:
+ operands.add(
+ builder.call(SqlStdOperatorTable.LESS_THAN,
+ builder.field("ct", "ck"), builder.field("ct", "c")),
+ builder.literal(b));
+ break;
+ }
+ operands.add(builder.literal(false));
+ return builder.call(SqlStdOperatorTable.CASE, operands.build());
+
+ default:
+ throw new AssertionError(e.getKind());
+ }
+ }
+
+ /** Returns a reference to a particular field, by offset, across several
+ * inputs on a {@link RelBuilder}'s stack. */
+ private RexInputRef field(HiveSubQRemoveRelBuilder builder, int inputCount, int offset) {
+ for (int inputOrdinal = 0;;) {
+ final RelNode r = builder.peek(inputCount, inputOrdinal);
+ if (offset < r.getRowType().getFieldCount()) {
+ return builder.field(inputCount, inputOrdinal, offset);
+ }
+ ++inputOrdinal;
+ offset -= r.getRowType().getFieldCount();
+ }
+ }
+
+ /** Returns a list of expressions that project the first {@code fieldCount}
+ * fields of the top input on a {@link RelBuilder}'s stack. */
+ private static List<RexNode> fields(HiveSubQRemoveRelBuilder builder, int fieldCount) {
+ final List<RexNode> projects = new ArrayList<>();
+ for (int i = 0; i < fieldCount; i++) {
+ projects.add(builder.field(i));
+ }
+ return projects;
+ }
+
+ /** Shuttle that replaces occurrences of a given
+ * {@link org.apache.calcite.rex.RexSubQuery} with a replacement
+ * expression. */
+ private static class ReplaceSubQueryShuttle extends RexShuttle {
+ private final RexSubQuery subQuery;
+ private final RexNode replacement;
+
+ public ReplaceSubQueryShuttle(RexSubQuery subQuery, RexNode replacement) {
+ this.subQuery = subQuery;
+ this.replacement = replacement;
+ }
+
+ @Override public RexNode visitSubQuery(RexSubQuery subQuery) {
+ return RexUtil.eq(subQuery, this.subQuery) ? replacement : subQuery;
+ }
+ }
+}
+
+// End SubQueryRemoveRule.java
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
index d494c9f..73a9b0f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java
@@ -266,6 +266,11 @@ public class HiveOpConverter {
TableScanOperator ts = (TableScanOperator) OperatorFactory.get(
semanticAnalyzer.getOpContext(), tsd, new RowSchema(colInfos));
+ //now that we let Calcite process subqueries we might have more than one
+ // tablescan with same alias.
+ if(topOps.get(tableAlias) != null) {
+ tableAlias = tableAlias + this.uniqueCounter ;
+ }
topOps.put(tableAlias, ts);
if (LOG.isDebugEnabled()) {
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
index 679cec1..d36eb0b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
@@ -32,12 +32,14 @@ import org.apache.calcite.avatica.util.TimeUnit;
import org.apache.calcite.avatica.util.TimeUnitRange;
import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.CorrelationId;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.rex.RexUtil;
+import org.apache.calcite.rex.RexSubQuery;
import org.apache.calcite.sql.SqlCollation;
import org.apache.calcite.sql.SqlIntervalQualifier;
import org.apache.calcite.sql.SqlKind;
@@ -54,6 +56,7 @@ import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.common.type.HiveVarchar;
+import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature;
@@ -68,6 +71,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeSubQueryDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseBinary;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare;
@@ -117,22 +121,43 @@ public class RexNodeConverter {
private final ImmutableList<InputCtx> inputCtxs;
private final boolean flattenExpr;
+ //outerRR belongs to outer query and is required to resolve correlated references
+ private final RowResolver outerRR;
+ private final ImmutableMap<String, Integer> outerNameToPosMap;
+ private int correlatedId;
+
//Constructor used by HiveRexExecutorImpl
public RexNodeConverter(RelOptCluster cluster) {
this(cluster, new ArrayList<InputCtx>(), false);
}
+ //subqueries will need outer query's row resolver
+ public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
+ ImmutableMap<String, Integer> outerNameToPosMap,
+ ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR, int offset, boolean flattenExpr, int correlatedId) {
+ this.cluster = cluster;
+ this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, hiveRR , offset));
+ this.flattenExpr = flattenExpr;
+ this.outerRR = outerRR;
+ this.outerNameToPosMap = outerNameToPosMap;
+ this.correlatedId = correlatedId;
+ }
+
public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
ImmutableMap<String, Integer> nameToPosMap, int offset, boolean flattenExpr) {
this.cluster = cluster;
this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, null, offset));
this.flattenExpr = flattenExpr;
+ this.outerRR = null;
+ this.outerNameToPosMap = null;
}
public RexNodeConverter(RelOptCluster cluster, List<InputCtx> inpCtxLst, boolean flattenExpr) {
this.cluster = cluster;
this.inputCtxs = ImmutableList.<InputCtx> builder().addAll(inpCtxLst).build();
this.flattenExpr = flattenExpr;
+ this.outerRR = null;
+ this.outerNameToPosMap = null;
}
public RexNode convert(ExprNodeDesc expr) throws SemanticException {
@@ -144,12 +169,42 @@ public class RexNodeConverter {
return convert((ExprNodeColumnDesc) expr);
} else if (expr instanceof ExprNodeFieldDesc) {
return convert((ExprNodeFieldDesc) expr);
+ } else if(expr instanceof ExprNodeSubQueryDesc) {
+ return convert((ExprNodeSubQueryDesc) expr);
} else {
throw new RuntimeException("Unsupported Expression");
}
// TODO: handle ExprNodeColumnListDesc
}
+ private RexNode convert(final ExprNodeSubQueryDesc subQueryDesc) throws SemanticException {
+ if(subQueryDesc.getType() == ExprNodeSubQueryDesc.SubqueryType.IN) {
+ /*
+ * Check.5.h :: For In and Not In the SubQuery must implicitly or
+ * explicitly only contain one select item.
+ */
+ if(subQueryDesc.getRexSubQuery().getRowType().getFieldCount() > 1) {
+ throw new SemanticException(ErrorMsg.INVALID_SUBQUERY_EXPRESSION.getMsg(
+ "SubQuery can contain only 1 item in Select List."));
+ }
+ //create RexNode for LHS
+ RexNode rexNodeLhs = convert(subQueryDesc.getSubQueryLhs());
+
+ //create RexSubQuery node
+ RexNode rexSubQuery = RexSubQuery.in(subQueryDesc.getRexSubQuery(),
+ ImmutableList.<RexNode>of(rexNodeLhs) );
+ return rexSubQuery;
+ }
+ else if( subQueryDesc.getType() == ExprNodeSubQueryDesc.SubqueryType.EXISTS) {
+ RexNode subQueryNode = RexSubQuery.exists(subQueryDesc.getRexSubQuery());
+ return subQueryNode;
+ }
+ else {
+ throw new SemanticException(ErrorMsg.INVALID_SUBQUERY_EXPRESSION.getMsg(
+ "Currently only IN and EXISTS type of subqueries are supported"));
+ }
+ }
+
private RexNode convert(final ExprNodeFieldDesc fieldDesc) throws SemanticException {
RexNode rexNode = convert(fieldDesc.getDesc());
if (rexNode.getType().isStruct()) {
@@ -420,7 +475,7 @@ public class RexNodeConverter {
private InputCtx getInputCtx(ExprNodeColumnDesc col) throws SemanticException {
InputCtx ctxLookingFor = null;
- if (inputCtxs.size() == 1) {
+ if (inputCtxs.size() == 1 && inputCtxs.get(0).hiveRR == null) {
ctxLookingFor = inputCtxs.get(0);
} else {
String tableAlias = col.getTabAlias();
@@ -443,7 +498,21 @@ public class RexNodeConverter {
}
protected RexNode convert(ExprNodeColumnDesc col) throws SemanticException {
+ //if this is co-rrelated we need to make RexCorrelVariable(with id and type)
+ // id and type should be retrieved from outerRR
InputCtx ic = getInputCtx(col);
+ if(ic == null) {
+ // we have correlated column, build data type from outer rr
+ RelDataType rowType = TypeConverter.getType(cluster, this.outerRR, null);
+ if (this.outerNameToPosMap.get(col.getColumn()) == null) {
+ throw new SemanticException(ErrorMsg.INVALID_COLUMN_NAME.getMsg(col.getColumn()));
+ }
+
+ int pos = this.outerNameToPosMap.get(col.getColumn());
+ CorrelationId colCorr = new CorrelationId(this.correlatedId);
+ RexNode corExpr = cluster.getRexBuilder().makeCorrel(rowType, colCorr);
+ return cluster.getRexBuilder().makeFieldAccess(corExpr, pos);
+ }
int pos = ic.hiveNameToPosMap.get(col.getColumn());
return cluster.getRexBuilder().makeInputRef(
ic.calciteInpDataType.getFieldList().get(pos).getType(), pos + ic.offsetInCalciteSchema);
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 07155fd..278f5b6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -24,9 +24,11 @@ import java.lang.reflect.UndeclaredThrowableException;
import java.math.BigDecimal;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
+import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
+import java.util.Deque;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
@@ -109,6 +111,7 @@ import org.apache.calcite.util.CompositeList;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.ImmutableIntList;
import org.apache.calcite.util.Pair;
+import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
@@ -136,6 +139,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider;
import org.apache.hadoop.hive.ql.optimizer.calcite.HivePlannerContext;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelDecorrelator;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl;
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
@@ -192,6 +196,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortMergeRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortProjectTransposeRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortRemoveRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortUnionReduceRule;
+import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSubQueryRemoveRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveUnionPullUpConstantsRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveWindowingFixRule;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.views.HiveMaterializedViewFilterScanRule;
@@ -208,7 +213,6 @@ import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec;
import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression;
import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec;
import org.apache.hadoop.hive.ql.parse.QBExpr.Opcode;
-import org.apache.hadoop.hive.ql.parse.QBSubQuery.SubQueryType;
import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec;
import org.apache.hadoop.hive.ql.parse.WindowingSpec.RangeBoundarySpec;
import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec;
@@ -242,6 +246,7 @@ import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
+import com.google.common.math.IntMath;
public class CalcitePlanner extends SemanticAnalyzer {
@@ -332,6 +337,10 @@ public class CalcitePlanner extends SemanticAnalyzer {
boolean reAnalyzeAST = false;
final boolean materializedView = getQB().isMaterializedView();
+ // currently semi-join optimization doesn't work with subqueries
+ // so this will be turned off for if we find subqueries and will later be
+ // restored to its original state
+ boolean originalSemiOptVal = this.conf.getBoolVar(ConfVars.SEMIJOIN_CONVERSION);
try {
if (this.conf.getBoolVar(HiveConf.ConfVars.HIVE_CBO_RETPATH_HIVEOP)) {
sinkOp = getOptimizedHiveOPDag();
@@ -431,6 +440,8 @@ public class CalcitePlanner extends SemanticAnalyzer {
super.genResolvedParseTree(ast, new PlannerContext());
skipCalcitePlan = true;
}
+ // restore semi-join opt flag
+ this.conf.setBoolVar(ConfVars.SEMIJOIN_CONVERSION, originalSemiOptVal);
}
} else {
this.ctx.setCboInfo("Plan not optimized by CBO.");
@@ -1000,6 +1011,10 @@ public class CalcitePlanner extends SemanticAnalyzer {
private final ColumnAccessInfo columnAccessInfo;
private Map<HiveProject, Table> viewProjectToTableSchema;
+ //correlated vars across subqueries within same query needs to have different ID
+ // this will be used in RexNodeConverter to create cor var
+ private int subqueryId;
+
// TODO: Do we need to keep track of RR, ColNameToPosMap for every op or
// just last one.
LinkedHashMap<RelNode, RowResolver> relToHiveRR = new LinkedHashMap<RelNode, RowResolver>();
@@ -1015,6 +1030,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
RelNode calciteGenPlan = null;
RelNode calcitePreCboPlan = null;
RelNode calciteOptimizedPlan = null;
+ subqueryId = -1;
/*
* recreate cluster, so that it picks up the additional traitDef
@@ -1038,7 +1054,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
// 1. Gen Calcite Plan
perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.OPTIMIZER);
try {
- calciteGenPlan = genLogicalPlan(getQB(), true);
+ calciteGenPlan = genLogicalPlan(getQB(), true, null, null);
resultSchema = SemanticAnalyzer.convertRowSchemaToResultSetSchema(
relToHiveRR.get(calciteGenPlan),
HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
@@ -1062,6 +1078,15 @@ public class CalcitePlanner extends SemanticAnalyzer {
// Create executor
Executor executorProvider = new HiveRexExecutorImpl(optCluster);
+ //Remove subquery
+ LOG.debug("Plan before removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+ calciteGenPlan = hepPlan(calciteGenPlan, false, mdProvider.getMetadataProvider(), null,
+ HiveSubQueryRemoveRule.FILTER);
+ LOG.debug("Plan just after removing subquery:\n" + RelOptUtil.toString(calciteGenPlan));
+
+ calciteGenPlan = HiveRelDecorrelator.decorrelateQuery(calciteGenPlan);
+ LOG.debug("Plan after decorrelation:\n" + RelOptUtil.toString(calciteGenPlan));
+
// 2. Apply pre-join order optimizations
calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan,
mdProvider.getMetadataProvider(), executorProvider);
@@ -2018,8 +2043,10 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
private RelNode genFilterRelNode(ASTNode filterExpr, RelNode srcRel,
+ ImmutableMap<String, Integer> outerNameToPosMap, RowResolver outerRR,
boolean useCaching) throws SemanticException {
- ExprNodeDesc filterCondn = genExprNodeDesc(filterExpr, relToHiveRR.get(srcRel), useCaching);
+ ExprNodeDesc filterCondn = genExprNodeDesc(filterExpr, relToHiveRR.get(srcRel),
+ outerRR, null, useCaching);
if (filterCondn instanceof ExprNodeConstantDesc
&& !filterCondn.getTypeString().equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
// queries like select * from t1 where 'foo';
@@ -2035,7 +2062,8 @@ public class CalcitePlanner extends SemanticAnalyzer {
ImmutableMap<String, Integer> hiveColNameCalcitePosMap = this.relToHiveColNameCalcitePosMap
.get(srcRel);
RexNode convertedFilterExpr = new RexNodeConverter(cluster, srcRel.getRowType(),
- hiveColNameCalcitePosMap, 0, true).convert(filterCondn);
+ outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), outerRR,
+ 0, true, subqueryId).convert(filterCondn);
RexNode factoredFilterExpr = RexUtil
.pullFactors(cluster.getRexBuilder(), convertedFilterExpr);
RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
@@ -2047,66 +2075,46 @@ public class CalcitePlanner extends SemanticAnalyzer {
return filterRel;
}
- private RelNode genFilterRelNode(QB qb, ASTNode searchCond, RelNode srcRel,
- Map<String, RelNode> aliasToRel, boolean forHavingClause) throws SemanticException {
- /*
- * Handle Subquery predicates.
- *
- * Notes (8/22/14 hb): Why is this a copy of the code from {@link
- * #genFilterPlan} - for now we will support the same behavior as non CBO
- * route. - but plan to allow nested SubQueries(Restriction.9.m) and
- * multiple SubQuery expressions(Restriction.8.m). This requires use to
- * utilize Calcite's Decorrelation mechanics, and for Calcite to fix/flush
- * out Null semantics(CALCITE-373) - besides only the driving code has
- * been copied. Most of the code which is SubQueryUtils and QBSubQuery is
- * reused.
- */
- int numSrcColumns = srcRel.getRowType().getFieldCount();
- List<ASTNode> subQueriesInOriginalTree = SubQueryUtils.findSubQueries(searchCond);
- if (subQueriesInOriginalTree.size() > 0) {
+ private void subqueryRestritionCheck(QB qb, ASTNode searchCond, RelNode srcRel,
+ boolean forHavingClause, Map<String, RelNode> aliasToRel ) throws SemanticException {
+ List<ASTNode> subQueriesInOriginalTree = SubQueryUtils.findSubQueries(searchCond);
+ if (subQueriesInOriginalTree.size() > 0) {
/*
* Restriction.9.m :: disallow nested SubQuery expressions.
*/
- if (qb.getSubQueryPredicateDef() != null) {
- throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
- subQueriesInOriginalTree.get(0), "Nested SubQuery expressions are not supported."));
- }
+ if (qb.getSubQueryPredicateDef() != null) {
+ throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
+ subQueriesInOriginalTree.get(0),
+ "Nested SubQuery expressions are not supported."));
+ }
/*
* Restriction.8.m :: We allow only 1 SubQuery expression per Query.
*/
- if (subQueriesInOriginalTree.size() > 1) {
+ if (subQueriesInOriginalTree.size() > 1) {
- throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
- subQueriesInOriginalTree.get(1), "Only 1 SubQuery expression is supported."));
- }
-
- /*
- * Clone the Search AST; apply all rewrites on the clone.
- */
- ASTNode clonedSearchCond = (ASTNode) SubQueryUtils.adaptor.dupTree(searchCond);
- List<ASTNode> subQueries = SubQueryUtils.findSubQueries(clonedSearchCond);
-
- RowResolver inputRR = relToHiveRR.get(srcRel);
- RowResolver outerQBRR = inputRR;
- ImmutableMap<String, Integer> outerQBPosMap = relToHiveColNameCalcitePosMap.get(srcRel);
-
- for (int i = 0; i < subQueries.size(); i++) {
- ASTNode subQueryAST = subQueries.get(i);
- ASTNode originalSubQueryAST = subQueriesInOriginalTree.get(i);
+ throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
+ subQueriesInOriginalTree.get(1), "Only 1 SubQuery expression is supported."));
+ }
+ //we do not care about the transformation or rewriting of AST
+ // which following statement does
+ // we only care about the restriction checks they perform.
+ // We plan to get rid of these restrictions later
int sqIdx = qb.incrNumSubQueryPredicates();
+ ASTNode originalSubQueryAST = subQueriesInOriginalTree.get(0);
+
+ ASTNode clonedSearchCond = (ASTNode) SubQueryUtils.adaptor.dupTree(searchCond);
+ List<ASTNode> subQueries = SubQueryUtils.findSubQueries(clonedSearchCond);
+ ASTNode subQueryAST = subQueries.get(0);
clonedSearchCond = SubQueryUtils.rewriteParentQueryWhere(clonedSearchCond, subQueryAST);
QBSubQuery subQuery = SubQueryUtils.buildSubQuery(qb.getId(), sqIdx, subQueryAST,
- originalSubQueryAST, ctx);
+ originalSubQueryAST, ctx);
+
+ RowResolver inputRR = relToHiveRR.get(srcRel);
- if (!forHavingClause) {
- qb.setWhereClauseSubQueryPredicate(subQuery);
- } else {
- qb.setHavingClauseSubQueryPredicate(subQuery);
- }
String havingInputAlias = null;
if (forHavingClause) {
@@ -2115,78 +2123,78 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
subQuery.validateAndRewriteAST(inputRR, forHavingClause, havingInputAlias,
- aliasToRel.keySet());
-
- QB qbSQ = new QB(subQuery.getOuterQueryId(), subQuery.getAlias(), true);
- qbSQ.setSubQueryDef(subQuery.getSubQuery());
- Phase1Ctx ctx_1 = initPhase1Ctx();
- doPhase1(subQuery.getSubQueryAST(), qbSQ, ctx_1, null);
- getMetaData(qbSQ);
- RelNode subQueryRelNode = genLogicalPlan(qbSQ, false);
- aliasToRel.put(subQuery.getAlias(), subQueryRelNode);
- RowResolver sqRR = relToHiveRR.get(subQueryRelNode);
-
- /*
- * Check.5.h :: For In and Not In the SubQuery must implicitly or
- * explicitly only contain one select item.
- */
- if (subQuery.getOperator().getType() != SubQueryType.EXISTS
- && subQuery.getOperator().getType() != SubQueryType.NOT_EXISTS
- && sqRR.getColumnInfos().size() - subQuery.getNumOfCorrelationExprsAddedToSQSelect() > 1) {
- throw new SemanticException(ErrorMsg.INVALID_SUBQUERY_EXPRESSION.getMsg(subQueryAST,
- "SubQuery can contain only 1 item in Select List."));
- }
-
- /*
- * If this is a Not In SubQuery Predicate then Join in the Null Check
- * SubQuery. See QBSubQuery.NotInCheck for details on why and how this
- * is constructed.
- */
- if (subQuery.getNotInCheck() != null) {
- QBSubQuery.NotInCheck notInCheck = subQuery.getNotInCheck();
- notInCheck.setSQRR(sqRR);
- QB qbSQ_nic = new QB(subQuery.getOuterQueryId(), notInCheck.getAlias(), true);
- qbSQ_nic.setSubQueryDef(notInCheck.getSubQuery());
- ctx_1 = initPhase1Ctx();
- doPhase1(notInCheck.getSubQueryAST(), qbSQ_nic, ctx_1, null);
- getMetaData(qbSQ_nic);
- RelNode subQueryNICRelNode = genLogicalPlan(qbSQ_nic, false);
- aliasToRel.put(notInCheck.getAlias(), subQueryNICRelNode);
- srcRel = genJoinRelNode(srcRel, subQueryNICRelNode,
- // set explicitly to inner until we figure out SemiJoin use
- // notInCheck.getJoinType(),
- JoinType.INNER, notInCheck.getJoinConditionAST());
- inputRR = relToHiveRR.get(srcRel);
- if (forHavingClause) {
- aliasToRel.put(havingInputAlias, srcRel);
- }
- }
-
- /*
- * Gen Join between outer Operator and SQ op
- */
- subQuery.buildJoinCondition(inputRR, sqRR, forHavingClause, havingInputAlias);
- srcRel = genJoinRelNode(srcRel, subQueryRelNode, subQuery.getJoinType(),
- subQuery.getJoinConditionAST());
- searchCond = subQuery.updateOuterQueryFilter(clonedSearchCond);
-
- srcRel = genFilterRelNode(searchCond, srcRel, forHavingClause);
-
- /*
- * For Not Exists and Not In, add a projection on top of the Left
- * Outer Join.
- */
- if (subQuery.getOperator().getType() != SubQueryType.NOT_EXISTS
- || subQuery.getOperator().getType() != SubQueryType.NOT_IN) {
- srcRel = projectLeftOuterSide(srcRel, numSrcColumns);
+ aliasToRel.keySet());
+
+ // Missing Check: Check.5.h :: For In and Not In the SubQuery must implicitly or
+ // explicitly only contain one select item.
+ }
+ }
+ private boolean genSubQueryRelNode(QB qb, ASTNode node, RelNode srcRel, boolean forHavingClause,
+ Map<ASTNode, RelNode> subQueryToRelNode,
+ Map<String, RelNode> aliasToRel) throws SemanticException {
+
+ //disallow subqueries which HIVE doesn't currently support
+ subqueryRestritionCheck(qb, node, srcRel, forHavingClause, aliasToRel);
+ Deque<ASTNode> stack = new ArrayDeque<ASTNode>();
+ stack.push(node);
+
+ boolean isSubQuery = false;
+
+ while (!stack.isEmpty()) {
+ ASTNode next = stack.pop();
+
+ switch(next.getType()) {
+ case HiveParser.TOK_SUBQUERY_EXPR:
+ String sbQueryAlias = "sq_" + qb.incrNumSubQueryPredicates();
+ QB qbSQ = new QB(qb.getId(), sbQueryAlias, true);
+ Phase1Ctx ctx1 = initPhase1Ctx();
+ doPhase1((ASTNode)next.getChild(1), qbSQ, ctx1, null);
+ getMetaData(qbSQ);
+ subqueryId++;
+ RelNode subQueryRelNode = genLogicalPlan(qbSQ, false, relToHiveColNameCalcitePosMap.get(srcRel),
+ relToHiveRR.get(srcRel));
+ subQueryToRelNode.put(next, subQueryRelNode);
+ isSubQuery = true;
+ break;
+ default:
+ int childCount = next.getChildCount();
+ for(int i = childCount - 1; i >= 0; i--) {
+ stack.push((ASTNode) next.getChild(i));
+ }
}
- }
- relToHiveRR.put(srcRel, outerQBRR);
- relToHiveColNameCalcitePosMap.put(srcRel, outerQBPosMap);
- return srcRel;
}
-
- return genFilterRelNode(searchCond, srcRel, forHavingClause);
+ return isSubQuery;
+ }
+ private RelNode genFilterRelNode(QB qb, ASTNode searchCond, RelNode srcRel,
+ Map<String, RelNode> aliasToRel, ImmutableMap<String, Integer> outerNameToPosMap,
+ RowResolver outerRR, boolean forHavingClause) throws SemanticException {
+
+ Map<ASTNode, RelNode> subQueryToRelNode = new HashMap<>();
+ boolean isSubQuery = genSubQueryRelNode(qb, searchCond, srcRel, forHavingClause,
+ subQueryToRelNode, aliasToRel);
+ if(isSubQuery) {
+ ExprNodeDesc subQueryExpr = genExprNodeDesc(searchCond, relToHiveRR.get(srcRel),
+ outerRR, subQueryToRelNode, forHavingClause);
+
+ ImmutableMap<String, Integer> hiveColNameCalcitePosMap = this.relToHiveColNameCalcitePosMap
+ .get(srcRel);
+ RexNode convertedFilterLHS = new RexNodeConverter(cluster, srcRel.getRowType(),
+ outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel),
+ outerRR, 0, true, subqueryId).convert(subQueryExpr);
+
+ RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
+ srcRel, convertedFilterLHS);
+
+ this.relToHiveColNameCalcitePosMap.put(filterRel, this.relToHiveColNameCalcitePosMap
+ .get(srcRel));
+ relToHiveRR.put(filterRel, relToHiveRR.get(srcRel));
+
+ // semi-join opt doesn't work with subqueries
+ conf.setBoolVar(ConfVars.SEMIJOIN_CONVERSION, false);
+ return filterRel;
+ } else {
+ return genFilterRelNode(searchCond, srcRel, outerNameToPosMap, outerRR, forHavingClause);
+ }
}
private RelNode projectLeftOuterSide(RelNode srcRel, int numColumns) throws SemanticException {
@@ -2213,14 +2221,15 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
private RelNode genFilterLogicalPlan(QB qb, RelNode srcRel, Map<String, RelNode> aliasToRel,
- boolean forHavingClause) throws SemanticException {
+ ImmutableMap<String, Integer> outerNameToPosMap, RowResolver outerRR,
+ boolean forHavingClause) throws SemanticException {
RelNode filterRel = null;
Iterator<ASTNode> whereClauseIterator = getQBParseInfo(qb).getDestToWhereExpr().values()
.iterator();
if (whereClauseIterator.hasNext()) {
filterRel = genFilterRelNode(qb, (ASTNode) whereClauseIterator.next().getChild(0), srcRel,
- aliasToRel, forHavingClause);
+ aliasToRel, outerNameToPosMap, outerRR, forHavingClause);
}
return filterRel;
@@ -3521,7 +3530,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
private RelNode genLogicalPlan(QBExpr qbexpr) throws SemanticException {
switch (qbexpr.getOpcode()) {
case NULLOP:
- return genLogicalPlan(qbexpr.getQB(), false);
+ return genLogicalPlan(qbexpr.getQB(), false, null, null);
case UNION:
case INTERSECT:
case INTERSECTALL:
@@ -3536,7 +3545,9 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
}
- private RelNode genLogicalPlan(QB qb, boolean outerMostQB) throws SemanticException {
+ private RelNode genLogicalPlan(QB qb, boolean outerMostQB,
+ ImmutableMap<String, Integer> outerNameToPosMap,
+ RowResolver outerRR) throws SemanticException {
RelNode srcRel = null;
RelNode filterRel = null;
RelNode gbRel = null;
@@ -3609,7 +3620,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
// 2. Build Rel for where Clause
- filterRel = genFilterLogicalPlan(qb, srcRel, aliasToRel, false);
+ filterRel = genFilterLogicalPlan(qb, srcRel, aliasToRel, outerNameToPosMap, outerRR, false);
srcRel = (filterRel == null) ? srcRel : filterRel;
RelNode starSrcRel = srcRel;
@@ -3717,7 +3728,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
// Special handling of grouping function
targetNode = rewriteGroupingFunctionAST(getGroupByForClause(qbp, destClauseName), targetNode);
}
- gbFilter = genFilterRelNode(qb, targetNode, srcRel, aliasToRel, true);
+ gbFilter = genFilterRelNode(qb, targetNode, srcRel, aliasToRel, null, null, true);
}
return gbFilter;
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
index 3458fb6..21ddae6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
@@ -538,16 +538,6 @@ public class QBSubQuery implements ISubQueryJoinInfo {
}
if ( sharedAlias != null) {
ASTNode whereClause = SubQueryUtils.subQueryWhere(insertClause);
-
- if ( whereClause != null ) {
- ASTNode u = SubQueryUtils.hasUnQualifiedColumnReferences(whereClause);
- if ( u != null ) {
- subQueryAST.setOrigin(originalSQASTOrigin);
- throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
- u, "SubQuery cannot use the table alias: " + sharedAlias + "; " +
- "this is also an alias in the Outer Query and SubQuery contains a unqualified column reference"));
- }
- }
}
/*
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 1f6dfdd..a861263 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -51,6 +51,7 @@ import org.antlr.runtime.tree.TreeVisitor;
import org.antlr.runtime.tree.TreeVisitorAction;
import org.antlr.runtime.tree.TreeWizard;
import org.antlr.runtime.tree.TreeWizard.ContextVisitor;
+import org.apache.calcite.rel.RelNode;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.hadoop.fs.FSDataOutputStream;
@@ -11325,6 +11326,17 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
return genExprNodeDesc(expr, input, true, false);
}
+ public ExprNodeDesc genExprNodeDesc(ASTNode expr, RowResolver input,
+ RowResolver outerRR, Map<ASTNode, RelNode> subqueryToRelNode,
+ boolean useCaching) throws SemanticException {
+
+ TypeCheckCtx tcCtx = new TypeCheckCtx(input, useCaching, false);
+ tcCtx.setOuterRR(outerRR);
+ tcCtx.setSubqueryToRelNode(subqueryToRelNode);
+ return genExprNodeDesc(expr, input, tcCtx);
+ }
+
+
public ExprNodeDesc genExprNodeDesc(ASTNode expr, RowResolver input, boolean useCaching)
throws SemanticException {
return genExprNodeDesc(expr, input, useCaching, false);
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckCtx.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckCtx.java
index 02896ff..9753f9e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckCtx.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckCtx.java
@@ -18,9 +18,11 @@
package org.apache.hadoop.hive.ql.parse;
+import org.apache.calcite.rel.RelNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import java.util.Map;
/**
* This class implements the context information that is used for typechecking
@@ -35,6 +37,19 @@ public class TypeCheckCtx implements NodeProcessorCtx {
*/
private RowResolver inputRR;
+ /**
+ * RowResolver of outer query. This is used to resolve co-rrelated columns in Filter
+ * TODO:
+ * this currently will only be able to resolve reference to parent query's column
+ * this will not work for references to grand-parent column
+ */
+ private RowResolver outerRR;
+
+ /**
+ * Map from astnode of a subquery to it's logical plan.
+ */
+ private Map<ASTNode, RelNode> subqueryToRelNode;
+
private final boolean useCaching;
private final boolean foldExpr;
@@ -50,7 +65,7 @@ public class TypeCheckCtx implements NodeProcessorCtx {
private String error;
/**
- * The node that generated the potential typecheck error
+ * The node that generated the potential typecheck error.
*/
private ASTNode errorSrcNode;
@@ -104,6 +119,8 @@ public class TypeCheckCtx implements NodeProcessorCtx {
this.allowWindowing = allowWindowing;
this.allowIndexExpr = allowIndexExpr;
this.allowSubQueryExpr = allowSubQueryExpr;
+ this.outerRR = null;
+ this.subqueryToRelNode = null;
}
/**
@@ -122,6 +139,36 @@ public class TypeCheckCtx implements NodeProcessorCtx {
}
/**
+ * @param outerRR
+ * the outerRR to set
+ */
+ public void setOuterRR(RowResolver outerRR) {
+ this.outerRR = outerRR;
+ }
+
+ /**
+ * @return the outerRR
+ */
+ public RowResolver getOuterRR() {
+ return outerRR;
+ }
+
+ /**
+ * @param subqueryToRelNode
+ * the subqueryToRelNode to set
+ */
+ public void setSubqueryToRelNode(Map<ASTNode, RelNode> subqueryToRelNode) {
+ this.subqueryToRelNode = subqueryToRelNode;
+ }
+
+ /**
+ * @return the outerRR
+ */
+ public Map<ASTNode, RelNode> getSubqueryToRelNode() {
+ return subqueryToRelNode;
+ }
+
+ /**
* @param unparseTranslator
* the unparseTranslator to set
*/
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
index ace3eaf..6c30efd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
@@ -30,6 +30,7 @@ import java.util.List;
import java.util.Map;
import java.util.Stack;
+import org.apache.calcite.rel.RelNode;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.hadoop.hive.common.type.HiveChar;
@@ -43,7 +44,6 @@ import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
-import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
@@ -52,6 +52,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.lib.ExpressionWalker;
import org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
@@ -60,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeSubQueryDesc;
import org.apache.hadoop.hive.ql.udf.SettableUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare;
@@ -134,7 +136,10 @@ public class TypeCheckProcFactory {
ASTNode expr = (ASTNode) nd;
TypeCheckCtx ctx = (TypeCheckCtx) procCtx;
- if (!ctx.isUseCaching()) {
+ // bypass only if outerRR is not null. Otherwise we need to look for expressions in outerRR for
+ // subqueries e.g. select min(b.value) from table b group by b.key
+ // having key in (select .. where a = min(b.value)
+ if (!ctx.isUseCaching() && ctx.getOuterRR() == null) {
return null;
}
@@ -147,6 +152,12 @@ public class TypeCheckProcFactory {
// If the current subExpression is pre-calculated, as in Group-By etc.
ColumnInfo colInfo = input.getExpression(expr);
+
+ // try outer row resolver
+ RowResolver outerRR = ctx.getOuterRR();
+ if(colInfo == null && outerRR != null) {
+ colInfo = outerRR.getExpression(expr);
+ }
if (colInfo != null) {
desc = new ExprNodeColumnDesc(colInfo);
ASTNode source = input.getExpressionSource(expr);
@@ -201,14 +212,14 @@ public class TypeCheckProcFactory {
+ HiveParser.TOK_INTERVAL_SECOND_LITERAL + "%"), tf.getIntervalExprProcessor());
opRules.put(new RuleRegExp("R7", HiveParser.TOK_TABLE_OR_COL + "%"),
tf.getColumnExprProcessor());
- opRules.put(new RuleRegExp("R8", HiveParser.TOK_SUBQUERY_OP + "%"),
+ opRules.put(new RuleRegExp("R8", HiveParser.TOK_SUBQUERY_EXPR + "%"),
tf.getSubQueryExprProcessor());
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
Dispatcher disp = new DefaultRuleDispatcher(tf.getDefaultExprProcessor(),
opRules, tcCtx);
- GraphWalker ogw = new DefaultGraphWalker(disp);
+ GraphWalker ogw = new ExpressionWalker(disp);
// Create a list of top nodes
ArrayList<Node> topNodes = Lists.<Node>newArrayList(expr);
@@ -618,6 +629,13 @@ public class TypeCheckProcFactory {
boolean isTableAlias = input.hasTableAlias(tableOrCol);
ColumnInfo colInfo = input.get(null, tableOrCol);
+ // try outer row resolver
+ if(ctx.getOuterRR() != null && colInfo == null && !isTableAlias) {
+ RowResolver outerRR = ctx.getOuterRR();
+ isTableAlias = outerRR.hasTableAlias(tableOrCol);
+ colInfo = outerRR.get(null, tableOrCol);
+ }
+
if (isTableAlias) {
if (colInfo != null) {
if (parent != null && parent.getType() == HiveParser.DOT) {
@@ -1158,6 +1176,12 @@ public class TypeCheckProcFactory {
}
ColumnInfo colInfo = input.get(tableAlias, colName);
+ // Try outer Row resolver
+ if(colInfo == null && ctx.getOuterRR() != null) {
+ RowResolver outerRR = ctx.getOuterRR();
+ colInfo = outerRR.get(tableAlias, colName);
+ }
+
if (colInfo == null) {
ctx.setError(ErrorMsg.INVALID_COLUMN.getMsg(expr.getChild(1)), expr);
return null;
@@ -1222,6 +1246,10 @@ public class TypeCheckProcFactory {
return null;
}
+ if(expr.getType() == HiveParser.TOK_SUBQUERY_OP || expr.getType() == HiveParser.TOK_QUERY) {
+ return null;
+ }
+
if (expr.getType() == HiveParser.TOK_TABNAME) {
return null;
}
@@ -1379,11 +1407,47 @@ public class TypeCheckProcFactory {
return desc;
}
+ //TOK_SUBQUERY_EXPR should have either 2 or 3 children
+ assert(expr.getChildren().size() == 3 || expr.getChildren().size() == 2);
+ //First child should be operand
+ assert(expr.getChild(0).getType() == HiveParser.TOK_SUBQUERY_OP);
+
+ ASTNode subqueryOp = (ASTNode) expr.getChild(0);
+
+ boolean isIN = (subqueryOp.getChild(0).getType() == HiveParser.KW_IN
+ || subqueryOp.getChild(0).getType() == HiveParser.TOK_SUBQUERY_OP_NOTIN);
+ boolean isEXISTS = (subqueryOp.getChild(0).getType() == HiveParser.KW_EXISTS
+ || subqueryOp.getChild(0).getType() == HiveParser.TOK_SUBQUERY_OP_NOTEXISTS);
+
+ // subqueryToRelNode might be null if subquery expression anywhere other than
+ // as expected in filter (where/having). We should throw an appropriate error
+ // message
+
+ Map<ASTNode, RelNode> subqueryToRelNode = ctx.getSubqueryToRelNode();
+ if(subqueryToRelNode == null) {
+ throw new SemanticException(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(
+ " Currently SubQuery expressions are only allowed as " +
+ "Where and Having Clause predicates"));
+ }
+
+ //For now because subquery is only supported in filter
+ // we will create subquery expression of boolean type
+ if(isEXISTS) {
+ return new ExprNodeSubQueryDesc(TypeInfoFactory.booleanTypeInfo, subqueryToRelNode.get(expr),
+ ExprNodeSubQueryDesc.SubqueryType.EXISTS);
+ }
+ if(isIN) {
+ assert(nodeOutputs[2] != null);
+ ExprNodeDesc lhs = (ExprNodeDesc)nodeOutputs[2];
+ return new ExprNodeSubQueryDesc(TypeInfoFactory.booleanTypeInfo, subqueryToRelNode.get(expr),
+ ExprNodeSubQueryDesc.SubqueryType.IN, lhs);
+ }
+
/*
* Restriction.1.h :: SubQueries only supported in the SQL Where Clause.
*/
ctx.setError(ErrorMsg.UNSUPPORTED_SUBQUERY_EXPRESSION.getMsg(sqNode,
- "Currently SubQuery expressions are only allowed as Where Clause predicates"),
+ "Currently only IN & EXISTS SubQuery expressions are allowed"),
sqNode);
return null;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeSubQueryDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeSubQueryDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeSubQueryDesc.java
new file mode 100644
index 0000000..aec331b
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeSubQueryDesc.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.calcite.rel.RelNode;
+
+/**
+ * This encapsulate subquery expression which consists of
+ * Relnode for subquery.
+ * type (IN, EXISTS )
+ * LHS operand
+ */
+public class ExprNodeSubQueryDesc extends ExprNodeDesc implements Serializable {
+ private static final long serialVersionUID = 1L;
+
+ public static enum SubqueryType{
+ IN,
+ EXISTS,
+ };
+
+ public static final int IN=1;
+ public static final int EXISTS=2;
+
+ /**
+ * RexNode corresponding to subquery.
+ */
+ private RelNode rexSubQuery;
+ private ExprNodeDesc subQueryLhs;
+ private SubqueryType type;
+
+ public ExprNodeSubQueryDesc(TypeInfo typeInfo, RelNode subQuery, SubqueryType type) {
+ super(typeInfo);
+ this.rexSubQuery = subQuery;
+ this.subQueryLhs = null;
+ this.type = type;
+ }
+ public ExprNodeSubQueryDesc(TypeInfo typeInfo, RelNode subQuery,
+ SubqueryType type, ExprNodeDesc lhs) {
+ super(typeInfo);
+ this.rexSubQuery = subQuery;
+ this.subQueryLhs = lhs;
+ this.type = type;
+
+ }
+
+ public SubqueryType getType() {
+ return type;
+ }
+
+ public ExprNodeDesc getSubQueryLhs() {
+ return subQueryLhs;
+ }
+
+ public RelNode getRexSubQuery() {
+ return rexSubQuery;
+ }
+
+ @Override
+ public ExprNodeDesc clone() {
+ return new ExprNodeSubQueryDesc(typeInfo, rexSubQuery, type, subQueryLhs);
+ }
+
+ @Override
+ public boolean isSame(Object o) {
+ if (!(o instanceof ExprNodeSubQueryDesc)) {
+ return false;
+ }
+ ExprNodeSubQueryDesc dest = (ExprNodeSubQueryDesc) o;
+ if (subQueryLhs != null && dest.getSubQueryLhs() != null) {
+ if (!subQueryLhs.equals(dest.getSubQueryLhs())) {
+ return false;
+ }
+ }
+ if (!typeInfo.equals(dest.getTypeInfo())) {
+ return false;
+ }
+ if (!rexSubQuery.equals(dest.getRexSubQuery())) {
+ return false;
+ }
+ if(type != dest.getType()) {
+ return false;
+ }
+ return true;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientnegative/subquery_corr_from.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/subquery_corr_from.q b/ql/src/test/queries/clientnegative/subquery_corr_from.q
new file mode 100644
index 0000000..7df52c9
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/subquery_corr_from.q
@@ -0,0 +1,3 @@
+-- corr var are only supported in where/having clause
+
+select * from part po where p_size IN (select p_size from (select p_size, p_type from part pp where pp.p_brand = po.p_brand) p where p.p_type=po.p_type) ;
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientnegative/subquery_corr_grandparent.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/subquery_corr_grandparent.q b/ql/src/test/queries/clientnegative/subquery_corr_grandparent.q
new file mode 100644
index 0000000..3519c5b
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/subquery_corr_grandparent.q
@@ -0,0 +1,5 @@
+-- inner query can only refer to it's parent query columns
+select *
+from part x
+where x.p_name in (select y.p_name from part y where exists (select z.p_name from part z where x.p_name = z.p_name))
+;
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientnegative/subquery_corr_select.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/subquery_corr_select.q b/ql/src/test/queries/clientnegative/subquery_corr_select.q
new file mode 100644
index 0000000..a47cee1
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/subquery_corr_select.q
@@ -0,0 +1,2 @@
+-- correlated var is only allowed in where/having
+explain select * from part po where p_size IN (select po.p_size from part p where p.p_type=po.p_type) ;
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientnegative/subquery_restrictions.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/subquery_restrictions.q b/ql/src/test/queries/clientnegative/subquery_restrictions.q
new file mode 100644
index 0000000..80870d7
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/subquery_restrictions.q
@@ -0,0 +1,92 @@
+--Restriction.1.h SubQueries only supported in the SQL Where Clause.
+select src.key in (select key from src s1 where s1.key > '9')
+from src;
+
+select count(*)
+from src
+group by src.key in (select key from src s1 where s1.key > '9') ;
+
+--Restriction.2.h The subquery can only be the RHS of an expression
+----curently paser doesn't allow such queries
+--select * from part where (select p_size from part) IN (1,2);
+
+--Restriction.3.m The predicate operators supported are In, Not In, exists and Not exists.
+----select * from part where p_brand > (select key from src);
+
+--Check.4.h For Exists and Not Exists, the Sub Query must have 1 or more correlated predicates.
+select * from src where exists (select * from part);
+
+--Check.5.h multiple columns in subquery select
+select * from src where src.key in (select * from src s1 where s1.key > '9');
+
+--Restriction.6.m The LHS in a SubQuery must have all its Column References be qualified
+--This is not restriction anymore
+
+--Restriction 7.h subquery with or condition
+select count(*)
+from src
+where src.key in (select key from src s1 where s1.key > '9') or src.value is not null
+;
+
+--Restriction.8.m We allow only 1 SubQuery expression per Query
+select * from part where p_size IN (select p_size from part) AND p_brand IN (select p_brand from part);
+
+--Restriction 9.m nested subquery
+select *
+from part x
+where x.p_name in (select y.p_name from part y where exists (select z.p_name from part z where y.p_name = z.p_name))
+;
+
+--Restriction.10.h In a SubQuery references to Parent Query columns is only supported in the where clause.
+select * from part where p_size in (select p.p_size + part.p_size from part p);
+select * from part where part.p_size IN (select min(p_size) from part p group by part.p_type);
+
+
+--Restriction.11.m A SubQuery predicate that refers to a Parent Query column must be a valid Join predicate
+select * from part where p_size in (select p_size from part p where p.p_type > part.p_type);
+select * from part where part.p_size IN (select min(p_size) from part p where NOT(part.p_type = p.p_type));
+
+
+--Check.12.h SubQuery predicates cannot only refer to Parent Query columns
+select * from part where p_name IN (select p_name from part p where part.p_type <> 1);
+
+--Restriction.13.m In the case of an implied Group By on a correlated Sub- Query, the SubQuery always returns 1 row. For e.g. a count on an empty set is 0, while all other UDAFs return null. Converting such a SubQuery into a Join by computing all Groups in one shot, changes the semantics: the Group By SubQuery output will not contain rows for Groups that don\u2019t exist.
+select *
+from src b
+where exists
+ (select count(*)
+ from src a
+ where b.value = a.value and a.key = b.key and a.value > 'val_9'
+ )
+;
+
+--Restriction.14.h Correlated Sub Queries cannot contain Windowing clauses.
+select p_mfgr, p_name, p_size
+from part a
+where a.p_size in
+ (select first_value(p_size) over(partition by p_mfgr order by p_size)
+ from part b
+ where a.p_brand = b.p_brand)
+;
+
+--Restriction 15.h all unqualified column references in a SubQuery will resolve to table sources within the SubQuery.
+select *
+from src
+where src.key in (select key from src where key > '9')
+;
+
+----------------------------------------------------------------
+-- Following tests does not fall under any restrictions per-se, they just currently don't work with HIVE
+----------------------------------------------------------------
+
+-- correlated var which refers to outer query join table
+explain select p.p_partkey, li.l_suppkey from (select distinct l_partkey as p_partkey from lineitem) p join lineitem li on p.p_partkey = li.l_partkey where li.l_linenumber = 1 and li.l_orderkey in (select l_orderkey from lineitem where l_shipmode = 'AIR' and l_partkey = p.l_partkey) ;
+
+-- union, not in, corr
+explain select * from part where p_name NOT IN (select p_name from part p where p.p_mfgr = part.p_comment UNION ALL select p_brand from part);
+
+-- union, not in, corr, cor var in both queries
+explain select * from part where p_name NOT IN (select p_name from part p where p.p_mfgr = part.p_comment UNION ALL select p_brand from part pp where pp.p_mfgr = part.p_comment);
+
+-- IN, union, corr
+explain select * from part where p_name IN (select p_name from part p where p.p_mfgr = part.p_comment UNION ALL select p_brand from part);
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q b/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
index cbfb5d5..8936073 100644
--- a/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
+++ b/ql/src/test/queries/clientpositive/cbo_rp_auto_join1.q
@@ -1,5 +1,6 @@
set hive.cbo.returnpath.hiveop=true;
set hive.stats.fetch.column.stats=true;
+set hive.enable.semijoin.conversion=true;
;
set hive.exec.reducers.max = 1;
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientpositive/join31.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/join31.q b/ql/src/test/queries/clientpositive/join31.q
index c79105f..aa17b4d 100644
--- a/ql/src/test/queries/clientpositive/join31.q
+++ b/ql/src/test/queries/clientpositive/join31.q
@@ -1,4 +1,5 @@
set hive.mapred.mode=nonstrict;
+set hive.enable.semijoin.conversion=true;
-- SORT_QUERY_RESULTS
CREATE TABLE dest_j1(key STRING, cnt INT);
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientpositive/multiMapJoin2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/multiMapJoin2.q b/ql/src/test/queries/clientpositive/multiMapJoin2.q
index cf5dbb0..c66dc66 100644
--- a/ql/src/test/queries/clientpositive/multiMapJoin2.q
+++ b/ql/src/test/queries/clientpositive/multiMapJoin2.q
@@ -3,6 +3,7 @@ set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecutePrinter,org.
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=6000;
+set hive.enable.semijoin.conversion=true;
-- we will generate one MR job.
EXPLAIN
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientpositive/subquery_in.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/subquery_in.q b/ql/src/test/queries/clientpositive/subquery_in.q
index c01ae70..5b22dce 100644
--- a/ql/src/test/queries/clientpositive/subquery_in.q
+++ b/ql/src/test/queries/clientpositive/subquery_in.q
@@ -1,5 +1,6 @@
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
+
-- SORT_QUERY_RESULTS
-- non agg, non corr
@@ -118,3 +119,104 @@ from (select distinct l_partkey as p_partkey from lineitem) p join lineitem li o
where li.l_linenumber = 1 and
li.l_orderkey in (select l_orderkey from lineitem where l_shipmode = 'AIR' and l_linenumber = li.l_linenumber)
;
+
+
+--where has multiple conjuction
+explain select * from part where p_brand <> 'Brand#14' AND p_size IN (select min(p_size) from part p where p.p_type = part.p_type group by p_type) AND p_size <> 340;
+select * from part where p_brand <> 'Brand#14' AND p_size IN (select min(p_size) from part p where p.p_type = part.p_type group by p_type) AND p_size <> 340;
+
+--lhs contains non-simple expression
+explain select * from part where (p_size-1) IN (select min(p_size) from part group by p_type);
+select * from part where (p_size-1) IN (select min(p_size) from part group by p_type);
+
+explain select * from part where (p_partkey*p_size) IN (select min(p_partkey) from part group by p_type);
+select * from part where (p_partkey*p_size) IN (select min(p_partkey) from part group by p_type);
+
+--lhs contains non-simple expression, corr
+explain select count(*) as c from part as e where p_size + 100 IN (select p_partkey from part where p_name = e.p_name);
+select count(*) as c from part as e where p_size + 100 IN (select p_partkey from part where p_name = e.p_name);
+
+-- lhs contains udf expression
+explain select * from part where floor(p_retailprice) IN (select floor(min(p_retailprice)) from part group by p_type);
+select * from part where floor(p_retailprice) IN (select floor(min(p_retailprice)) from part group by p_type);
+
+explain select * from part where p_name IN (select p_name from part p where p.p_size = part.p_size AND part.p_size + 121150 = p.p_partkey );
+select * from part where p_name IN (select p_name from part p where p.p_size = part.p_size AND part.p_size + 121150 = p.p_partkey );
+
+-- correlated query, multiple correlated variables referring to different outer var
+explain select * from part where p_name IN (select p_name from part p where p.p_size = part.p_size AND part.p_partkey= p.p_partkey );
+select * from part where p_name IN (select p_name from part p where p.p_size = part.p_size AND part.p_partkey= p.p_partkey );
+
+-- correlated var refers to outer table alias
+explain select p_name from (select p_name, p_type, p_brand as brand from part) fpart where fpart.p_type IN (select p_type from part where part.p_brand = fpart.brand);
+select p_name from (select p_name, p_type, p_brand as brand from part) fpart where fpart.p_type IN (select p_type from part where part.p_brand = fpart.brand);
+
+-- correlated var refers to outer table alias which is an expression
+explain select p_name from (select p_name, p_type, p_size+1 as size from part) fpart where fpart.p_type IN (select p_type from part where (part.p_size+1) = fpart.size);
+select p_name from (select p_name, p_type, p_size+1 as size from part) fpart where fpart.p_type IN (select p_type from part where (part.p_size+1) = fpart.size);
+
+-- where plus having
+explain select key, count(*) from src where value IN (select value from src) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+select key, count(*) from src where value IN (select value from src) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+
+-- where with having, correlated
+explain select key, count(*) from src where value IN (select value from src sc where sc.key = src.key ) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+select key, count(*) from src where value IN (select value from src sc where sc.key = src.key ) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+
+-- subquery with order by
+explain select * from part where (p_size-1) IN (select min(p_size) from part group by p_type) order by p_brand;
+select * from part where (p_size-1) IN (select min(p_size) from part group by p_type) order by p_brand;
+
+--order by with limit
+explain select * from part where (p_size-1) IN (select min(p_size) from part group by p_type) order by p_brand limit 4;
+select * from part where (p_size-1) IN (select min(p_size) from part group by p_type) order by p_brand limit 4;
+
+-- union, uncorr
+explain select * from src where key IN (select p_name from part UNION ALL select p_brand from part);
+select * from src where key IN (select p_name from part UNION ALL select p_brand from part);
+
+-- corr, subquery has another subquery in from
+explain select p_mfgr, b.p_name, p_size from part b where b.p_name in
+ (select p_name from (select p_mfgr, p_name, p_size as r from part) a where r < 10 and b.p_mfgr = a.p_mfgr ) order by p_mfgr,p_size;
+select p_mfgr, b.p_name, p_size from part b where b.p_name in
+ (select p_name from (select p_mfgr, p_name, p_size as r from part) a where r < 10 and b.p_mfgr = a.p_mfgr ) order by p_mfgr,p_size;
+
+-- join in subquery, correlated predicate with only one table
+explain select p_partkey from part where p_name in (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size);
+select p_partkey from part where p_name in (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size);
+
+-- join in subquery, correlated predicate with both inner tables, same outer var
+explain select p_partkey from part where p_name in
+ (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size and p.p_size=part.p_size);
+select p_partkey from part where p_name in
+ (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size and p.p_size=part.p_size);
+
+-- join in subquery, correlated predicate with both inner tables, different outer var
+explain select p_partkey from part where p_name in
+ (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size and p.p_type=part.p_type);
+
+-- subquery within from
+explain select p_partkey from
+ (select p_size, p_partkey from part where p_name in (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size)) subq;
+select p_partkey from
+ (select p_size, p_partkey from part where p_name in (select p.p_name from part p left outer join part pp on p.p_type = pp.p_type where pp.p_size = part.p_size)) subq;
+
+
+create table tempty(i int);
+create table tnull(i int);
+insert into tnull values(NULL) , (NULL);
+
+-- empty inner table, non-null sq key, expected empty result
+select * from part where p_size IN (select i from tempty);
+
+-- empty inner table, null sq key, expected empty result
+select * from tnull where i IN (select i from tempty);
+
+-- null inner table, non-null sq key
+select * from part where p_size IN (select i from tnull);
+
+-- null inner table, null sq key
+select * from tnull where i IN (select i from tnull);
+
+drop table tempty;
+
http://git-wip-us.apache.org/repos/asf/hive/blob/382dc208/ql/src/test/queries/clientpositive/subquery_notin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/subquery_notin.q b/ql/src/test/queries/clientpositive/subquery_notin.q
index 3f4fb7f..c29e63f 100644
--- a/ql/src/test/queries/clientpositive/subquery_notin.q
+++ b/ql/src/test/queries/clientpositive/subquery_notin.q
@@ -1,4 +1,5 @@
set hive.mapred.mode=nonstrict;
+
-- non agg, non corr
explain
select *
@@ -76,10 +77,10 @@ order by p_mfgr, p_size
;
-- non agg, non corr, Group By in Parent Query
-select li.l_partkey, count(*)
-from lineitem li
-where li.l_linenumber = 1 and
- li.l_orderkey not in (select l_orderkey from lineitem where l_shipmode = 'AIR')
+select li.l_partkey, count(*)
+from lineitem li
+where li.l_linenumber = 1 and
+ li.l_orderkey not in (select l_orderkey from lineitem where l_shipmode = 'AIR')
group by li.l_partkey
;
@@ -103,3 +104,127 @@ from T1_v where T1_v.key not in (select T2_v.key from T2_v);
select *
from T1_v where T1_v.key not in (select T2_v.key from T2_v);
+
+--where has multiple conjuction
+explain select * from part where p_brand <> 'Brand#14' AND p_size NOT IN (select min(p_size) from part p where p.p_type = part.p_type group by p_type) AND p_size <> 340;
+select * from part where p_brand <> 'Brand#14' AND p_size NOT IN (select min(p_size) from part p where p.p_type = part.p_type group by p_type) AND p_size <> 340;
+
+--lhs contains non-simple expression
+explain select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type);
+select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type);
+
+explain select * from part where (p_partkey*p_size) NOT IN (select min(p_partkey) from part group by p_type);
+select * from part where (p_partkey*p_size) NOT IN (select min(p_partkey) from part group by p_type);
+
+--lhs contains non-simple expression, corr
+explain select count(*) as c from part as e where p_size + 100 NOT IN (select p_partkey from part where p_name = e.p_name);
+select count(*) as c from part as e where p_size + 100 NOT IN (select p_partkey from part where p_name = e.p_name);
+
+-- lhs contains udf expression
+explain select * from part where floor(p_retailprice) NOT IN (select floor(min(p_retailprice)) from part group by p_type);
+select * from part where floor(p_retailprice) NOT IN (select floor(min(p_retailprice)) from part group by p_type);
+
+explain select * from part where p_name NOT IN (select p_name from part p where p.p_size = part.p_size AND part.p_size + 121150 = p.p_partkey );
+select * from part where p_name NOT IN (select p_name from part p where p.p_size = part.p_size AND part.p_size + 121150 = p.p_partkey );
+
+-- correlated query, multiple correlated variables referring to different outer var
+explain select * from part where p_name NOT IN (select p_name from part p where p.p_size = part.p_size AND part.p_partkey= p.p_partkey );
+select * from part where p_name NOT IN (select p_name from part p where p.p_size = part.p_size AND part.p_partkey= p.p_partkey );
+
+-- correlated var refers to outer table alias
+explain select p_name from (select p_name, p_type, p_brand as brand from part) fpart where fpart.p_type NOT IN (select p_type from part where part.p_brand = fpart.brand);
+select p_name from (select p_name, p_type, p_brand as brand from part) fpart where fpart.p_type NOT IN (select p_type from part where part.p_brand = fpart.brand);
+
+-- correlated var refers to outer table alias which is an expression
+explain select p_name from (select p_name, p_type, p_size+1 as size from part) fpart where fpart.p_type NOT IN (select p_type from part where (part.p_size+1) = fpart.size);
+select p_name from (select p_name, p_type, p_size+1 as size from part) fpart where fpart.p_type NOT IN (select p_type from part where (part.p_size+1) = fpart.size);
+
+-- where plus having
+explain select key, count(*) from src where value NOT IN (select value from src) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+select key, count(*) from src where value NOT IN (select value from src) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+
+-- where with having, correlated
+explain select key, count(*) from src where value NOT IN (select value from src sc where sc.key = src.key ) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+select key, count(*) from src where value NOT IN (select value from src sc where sc.key = src.key ) group by key having count(*) in (select count(*) from src s1 where s1.key = '90' group by s1.key );
+
+-- subquery with order by
+explain select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type) order by p_brand;
+select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type) order by p_brand;
+
+--order by with limit
+explain select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type) order by p_brand limit 4;
+select * from part where (p_size-1) NOT IN (select min(p_size) from part group by p_type) order by p_brand limit 4;
+
+-- union, uncorr
+explain select * from src where key NOT IN (select p_name from part UNION ALL select p_brand from part);
+select * from src where key NOT IN (select p_name from part UNION ALL select p_brand from part);
+
+explain select count(*) as c from part as e where p_size + 100 not in ( select p_type from part where p_brand = e.p_brand);
+select count(*) as c from part as e where p_size + 100 not in ( select p_type from part where p_brand = e.p_brand);
+
+--nullability tests
+CREATE TABLE t1 (c1 INT, c2 CHAR(100));
+INSERT INTO t1 VALUES (null,null), (1,''), (2,'abcde'), (100,'abcdefghij');
+
+CREATE TABLE t2 (c1 INT);
+INSERT INTO t2 VALUES (null), (2), (100);
+
+-- uncorr
+explain SELECT c1 FROM t1 WHERE c1 NOT IN (SELECT c1 FROM t2);
+SELECT c1 FROM t1 WHERE c1 NOT IN (SELECT c1 FROM t2);
+
+-- corr
+explain SELECT c1 FROM t1 WHERE c1 NOT IN (SELECT c1 FROM t2 where t1.c2=t2.c1);
+SELECT c1 FROM t1 WHERE c1 NOT IN (SELECT c1 FROM t2 where t1.c1=t2.c1);
+
+DROP TABLE t1;
+DROP TABLE t2;
+
+-- corr, nullability, should not produce any result
+create table t1(a int, b int);
+insert into t1 values(1,0), (1,0),(1,0);
+
+create table t2(a int, b int);
+insert into t2 values(2,1), (3,1), (NULL,1);
+
+explain select t1.a from t1 where t1.b NOT IN (select t2.a from t2 where t2.b=t1.a);
+select t1.a from t1 where t1.b NOT IN (select t2.a from t2 where t2.b=t1.a);
+drop table t1;
+drop table t2;
+
+
+-- coor, nullability, should produce result
+create table t7(i int, j int);
+insert into t7 values(null, 5), (4, 15);
+
+create table fixOb(i int, j int);
+insert into fixOb values(-1, 5), (-1, 15);
+
+explain select * from fixOb where j NOT IN (select i from t7 where t7.j=fixOb.j);
+select * from fixOb where j NOT IN (select i from t7 where t7.j=fixOb.j);
+
+drop table t7;
+drop table fixOb;
+
+create table t(i int, j int);
+insert into t values(1,2), (4,5), (7, NULL);
+
+
+-- case with empty inner result (t1.j=t.j=NULL) and null subquery key(t.j = NULL)
+explain select t.i from t where t.j NOT IN (select t1.i from t t1 where t1.j=t.j);
+select t.i from t where t.j NOT IN (select t1.i from t t1 where t1.j=t.j);
+
+-- case with empty inner result (t1.j=t.j=NULL) and non-null subquery key(t.i is never null)
+explain select t.i from t where t.i NOT IN (select t1.i from t t1 where t1.j=t.j);
+select t.i from t where t.i NOT IN (select t1.i from t t1 where t1.j=t.j);
+
+-- case with non-empty inner result and null subquery key(t.j is null)
+explain select t.i from t where t.j NOT IN (select t1.i from t t1 );
+select t.i from t where t.j NOT IN (select t1.i from t t1 );
+
+-- case with non-empty inner result and non-null subquery key(t.i is never null)
+explain select t.i from t where t.i NOT IN (select t1.i from t t1 );
+select t.i from t where t.i NOT IN (select t1.i from t t1 );
+
+drop table t1;
+