You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2017/05/10 05:09:23 UTC
[6/6] hive git commit: HIVE-16330 : Improve plans for scalar subquery
with aggregates (Vineet Garg via Ashutosh Chauhan)
HIVE-16330 : Improve plans for scalar subquery with aggregates (Vineet Garg via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a113ede9
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a113ede9
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a113ede9
Branch: refs/heads/master
Commit: a113ede99e82a11384a144616938334696032dd8
Parents: 4ba48aa
Author: Vineet Garg <vg...@hortonworks.com>
Authored: Tue May 9 21:07:42 2017 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Tue May 9 22:06:52 2017 -0700
----------------------------------------------------------------------
.../optimizer/calcite/HivePlannerContext.java | 12 +-
.../calcite/rules/HiveSubQueryRemoveRule.java | 59 +-
.../hadoop/hive/ql/parse/CalcitePlanner.java | 25 +-
.../apache/hadoop/hive/ql/parse/QBSubQuery.java | 62 +-
.../calcite/TestCBORuleFiredOnlyOnce.java | 3 +-
.../subquery_scalar_corr_multi_rows.q | 3 +-
.../subquery_scalar_corr_multi_rows.q.out | 3 +-
.../clientpositive/llap/subquery_scalar.q.out | 2423 +++------------
.../clientpositive/llap/subquery_select.q.out | 2784 +++---------------
.../results/clientpositive/perf/query1.q.out | 240 +-
.../results/clientpositive/perf/query30.q.out | 335 +--
.../results/clientpositive/perf/query6.q.out | 349 +--
.../results/clientpositive/perf/query81.q.out | 335 +--
.../results/clientpositive/perf/query9.q.out | 1019 ++-----
14 files changed, 1859 insertions(+), 5793 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java
index d0b1757..bdf9955 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java
@@ -22,6 +22,7 @@ import org.apache.calcite.plan.Context;
import org.apache.calcite.rel.RelNode;
import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveAlgorithmsConf;
import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry;
+
import java.util.Set;
@@ -29,17 +30,18 @@ public class HivePlannerContext implements Context {
private HiveAlgorithmsConf algoConfig;
private HiveRulesRegistry registry;
private CalciteConnectionConfig calciteConfig;
- private Set<RelNode> corrScalarRexSQWithAgg;
+ private SubqueryConf subqueryConfig;
public HivePlannerContext(HiveAlgorithmsConf algoConfig, HiveRulesRegistry registry,
- CalciteConnectionConfig calciteConfig, Set<RelNode> corrScalarRexSQWithAgg) {
+ CalciteConnectionConfig calciteConfig, Set<RelNode> corrScalarRexSQWithAgg,
+ Set<RelNode> scalarAggNoGbyWindowing) {
this.algoConfig = algoConfig;
this.registry = registry;
this.calciteConfig = calciteConfig;
// this is to keep track if a subquery is correlated and contains aggregate
// this is computed in CalcitePlanner while planning and is later required by subquery remove rule
// hence this is passed using HivePlannerContext
- this.corrScalarRexSQWithAgg = corrScalarRexSQWithAgg;
+ this.subqueryConfig = new SubqueryConf(corrScalarRexSQWithAgg, scalarAggNoGbyWindowing);
}
public <T> T unwrap(Class<T> clazz) {
@@ -52,8 +54,8 @@ public class HivePlannerContext implements Context {
if (clazz.isInstance(calciteConfig)) {
return clazz.cast(calciteConfig);
}
- if(clazz.isInstance(corrScalarRexSQWithAgg)) {
- return clazz.cast(corrScalarRexSQWithAgg);
+ if(clazz.isInstance(subqueryConfig)) {
+ return clazz.cast(subqueryConfig);
}
return null;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
index c692cc0..83d3f74 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java
@@ -56,6 +56,7 @@ import java.util.Set;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveSubQRemoveRelBuilder;
+import org.apache.hadoop.hive.ql.optimizer.calcite.SubqueryConf;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter;
/**
@@ -99,11 +100,12 @@ public abstract class HiveSubQueryRemoveRule extends RelOptRule{
final int fieldCount = builder.peek().getRowType().getFieldCount();
assert(filter instanceof HiveFilter);
- Set<RelNode> corrScalarQueries = filter.getCluster().getPlanner().getContext().unwrap(Set.class);
- boolean isCorrScalarQuery = corrScalarQueries.contains(e.rel);
+ SubqueryConf subqueryConfig = filter.getCluster().getPlanner().getContext().unwrap(SubqueryConf.class);
+ boolean isCorrScalarQuery = subqueryConfig.getCorrScalarRexSQWithAgg().contains(e.rel);
+ boolean hasNoWindowingAndNoGby = subqueryConfig.getScalarAggWithoutGbyWindowing().contains(e.rel);
final RexNode target = apply(e, HiveFilter.getVariablesSet(e), logic,
- builder, 1, fieldCount, isCorrScalarQuery);
+ builder, 1, fieldCount, isCorrScalarQuery, hasNoWindowingAndNoGby);
final RexShuttle shuttle = new ReplaceSubQueryShuttle(e, target);
builder.filter(shuttle.apply(filter.getCondition()));
builder.project(fields(builder, filter.getRowType().getFieldCount()));
@@ -122,11 +124,12 @@ public abstract class HiveSubQueryRemoveRule extends RelOptRule{
builder.push(project.getInput());
final int fieldCount = builder.peek().getRowType().getFieldCount();
- Set<RelNode> corrScalarQueries = project.getCluster().getPlanner().getContext().unwrap(Set.class);
- boolean isCorrScalarQuery = corrScalarQueries.contains(e.rel);
+ SubqueryConf subqueryConfig = project.getCluster().getPlanner().getContext().unwrap(SubqueryConf.class);
+ boolean isCorrScalarQuery = subqueryConfig.getCorrScalarRexSQWithAgg().contains(e.rel);
+ boolean hasNoWindowingAndNoGby = subqueryConfig.getScalarAggWithoutGbyWindowing().contains(e.rel);
final RexNode target = apply(e, HiveFilter.getVariablesSet(e),
- logic, builder, 1, fieldCount, isCorrScalarQuery);
+ logic, builder, 1, fieldCount, isCorrScalarQuery, hasNoWindowingAndNoGby);
final RexShuttle shuttle = new ReplaceSubQueryShuttle(e, target);
builder.project(shuttle.apply(project.getProjects()),
project.getRowType().getFieldNames());
@@ -165,28 +168,32 @@ public abstract class HiveSubQueryRemoveRule extends RelOptRule{
protected RexNode apply(RexSubQuery e, Set<CorrelationId> variablesSet,
RelOptUtil.Logic logic,
HiveSubQRemoveRelBuilder builder, int inputCount, int offset,
- boolean isCorrScalarAgg) {
+ boolean isCorrScalarAgg,
+ boolean hasNoWindowingAndNoGby ) {
switch (e.getKind()) {
case SCALAR_QUERY:
- builder.push(e.rel);
- // returns single row/column
- builder.aggregate(builder.groupKey(),
- builder.count(false, "cnt"));
-
- SqlFunction countCheck = new SqlFunction("sq_count_check", SqlKind.OTHER_FUNCTION, ReturnTypes.BIGINT,
- InferTypes.RETURN_TYPE, OperandTypes.NUMERIC, SqlFunctionCategory.USER_DEFINED_FUNCTION);
-
- // we create FILTER (sq_count_check(count()) <= 1) instead of PROJECT because RelFieldTrimmer
- // ends up getting rid of Project since it is not used further up the tree
- builder.filter(builder.call(SqlStdOperatorTable.LESS_THAN_OR_EQUAL,
- builder.call(countCheck, builder.field("cnt")),
- builder.literal(1)));
- if( !variablesSet.isEmpty())
- {
- builder.join(JoinRelType.LEFT, builder.literal(true), variablesSet);
+ // if scalar query has aggregate and no windowing and no gby avoid adding sq_count_check
+ // since it is guaranteed to produce at most one row
+ if(!hasNoWindowingAndNoGby) {
+ builder.push(e.rel);
+ // returns single row/column
+ builder.aggregate(builder.groupKey(), builder.count(false, "cnt"));
+
+ SqlFunction countCheck =
+ new SqlFunction("sq_count_check", SqlKind.OTHER_FUNCTION, ReturnTypes.BIGINT,
+ InferTypes.RETURN_TYPE, OperandTypes.NUMERIC, SqlFunctionCategory.USER_DEFINED_FUNCTION);
+
+ // we create FILTER (sq_count_check(count()) <= 1) instead of PROJECT because RelFieldTrimmer
+ // ends up getting rid of Project since it is not used further up the tree
+ builder.filter(builder.call(SqlStdOperatorTable.LESS_THAN_OR_EQUAL,
+ builder.call(countCheck, builder.field("cnt")), builder.literal(1)));
+ if (!variablesSet.isEmpty()) {
+ builder.join(JoinRelType.LEFT, builder.literal(true), variablesSet);
+ } else
+ builder.join(JoinRelType.INNER, builder.literal(true), variablesSet);
+
+ offset++;
}
- else
- builder.join(JoinRelType.INNER, builder.literal(true), variablesSet);
if(isCorrScalarAgg) {
// Transformation :
// Outer Query Left Join (inner query) on correlated predicate and preserve rows only from left side.
@@ -218,10 +225,8 @@ public abstract class HiveSubQueryRemoveRule extends RelOptRule{
//Transformation is to left join for correlated predicates and inner join otherwise,
// but do a count on inner side before that to make sure it generates at most 1 row.
-
builder.push(e.rel);
builder.join(JoinRelType.LEFT, builder.literal(true), variablesSet);
- offset++;
return field(builder, inputCount, offset);
case IN:
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 5d640be..fa96e94 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -1299,6 +1299,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
// this is to keep track if a subquery is correlated and contains aggregate
// since this is special cased when it is rewritten in SubqueryRemoveRule
Set<RelNode> corrScalarRexSQWithAgg = new HashSet<RelNode>();
+ Set<RelNode> scalarAggNoGbyNoWin = new HashSet<RelNode>();
// TODO: Do we need to keep track of RR, ColNameToPosMap for every op or
// just last one.
@@ -1332,7 +1333,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
Boolean.FALSE.toString());
CalciteConnectionConfig calciteConfig = new CalciteConnectionConfigImpl(calciteConfigProperties);
HivePlannerContext confContext = new HivePlannerContext(algorithmsConf, registry, calciteConfig,
- corrScalarRexSQWithAgg);
+ corrScalarRexSQWithAgg, scalarAggNoGbyNoWin);
RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(confContext);
final RexBuilder rexBuilder = cluster.getRexBuilder();
final RelOptCluster optCluster = RelOptCluster.create(planner, rexBuilder);
@@ -2425,8 +2426,8 @@ public class CalcitePlanner extends SemanticAnalyzer {
}
private void subqueryRestrictionCheck(QB qb, ASTNode searchCond, RelNode srcRel,
- boolean forHavingClause,
- Set<ASTNode> corrScalarQueries) throws SemanticException {
+ boolean forHavingClause, Set<ASTNode> corrScalarQueries,
+ Set<ASTNode> scalarQueriesWithAggNoWinNoGby) throws SemanticException {
List<ASTNode> subQueriesInOriginalTree = SubQueryUtils.findSubQueries(searchCond);
ASTNode clonedSearchCond = (ASTNode) SubQueryUtils.adaptor.dupTree(searchCond);
@@ -2461,18 +2462,25 @@ public class CalcitePlanner extends SemanticAnalyzer {
String havingInputAlias = null;
- boolean isCorrScalarWithAgg = subQuery.subqueryRestrictionsCheck(inputRR, forHavingClause, havingInputAlias);
- if(isCorrScalarWithAgg) {
+ boolean [] subqueryConfig = {false, false};
+ subQuery.subqueryRestrictionsCheck(inputRR, forHavingClause,
+ havingInputAlias, subqueryConfig);
+ if(subqueryConfig[0]) {
corrScalarQueries.add(originalSubQueryAST);
}
+ if(subqueryConfig[1]) {
+ scalarQueriesWithAggNoWinNoGby.add(originalSubQueryAST);
+ }
}
}
private boolean genSubQueryRelNode(QB qb, ASTNode node, RelNode srcRel, boolean forHavingClause,
Map<ASTNode, RelNode> subQueryToRelNode) throws SemanticException {
Set<ASTNode> corrScalarQueriesWithAgg = new HashSet<ASTNode>();
+ Set<ASTNode> scalarQueriesWithAggNoWinNoGby= new HashSet<ASTNode>();
//disallow subqueries which HIVE doesn't currently support
- subqueryRestrictionCheck(qb, node, srcRel, forHavingClause, corrScalarQueriesWithAgg);
+ subqueryRestrictionCheck(qb, node, srcRel, forHavingClause, corrScalarQueriesWithAgg,
+ scalarQueriesWithAggNoWinNoGby);
Deque<ASTNode> stack = new ArrayDeque<ASTNode>();
stack.push(node);
@@ -2502,9 +2510,14 @@ public class CalcitePlanner extends SemanticAnalyzer {
subQueryToRelNode.put(next, subQueryRelNode);
//keep track of subqueries which are scalar, correlated and contains aggregate
// subquery expression. This will later be special cased in Subquery remove rule
+ // for correlated scalar queries with aggregate we have to take care of the case where
+ // inner aggregate happens on empty result
if(corrScalarQueriesWithAgg.contains(next)) {
corrScalarRexSQWithAgg.add(subQueryRelNode);
}
+ if(scalarQueriesWithAggNoWinNoGby.contains(next)) {
+ scalarAggNoGbyNoWin.add(subQueryRelNode);
+ }
isSubQuery = true;
break;
default:
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
index ec52741..0097a04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/QBSubQuery.java
@@ -526,9 +526,9 @@ public class QBSubQuery implements ISubQueryJoinInfo {
* @return true if it is correlated scalar subquery with an aggregate
* @throws SemanticException
*/
- boolean subqueryRestrictionsCheck(RowResolver parentQueryRR,
+ void subqueryRestrictionsCheck(RowResolver parentQueryRR,
boolean forHavingClause,
- String outerQueryAlias)
+ String outerQueryAlias, boolean [] subqueryConfig)
throws SemanticException {
ASTNode insertClause = getChildFromSubqueryAST("Insert", HiveParser.TOK_INSERT);
@@ -568,37 +568,35 @@ public class QBSubQuery implements ISubQueryJoinInfo {
hasCount = hasCount | ( r == 2 );
}
-
-
- ASTNode whereClause = SubQueryUtils.subQueryWhere(insertClause);
-
- if ( whereClause == null ) {
- return false;
- }
- ASTNode searchCond = (ASTNode) whereClause.getChild(0);
- List<ASTNode> conjuncts = new ArrayList<ASTNode>();
- SubQueryUtils.extractConjuncts(searchCond, conjuncts);
-
- ConjunctAnalyzer conjunctAnalyzer = new ConjunctAnalyzer(parentQueryRR,
- forHavingClause, outerQueryAlias);
-
+ // figure out correlation and presence of non-equi join predicate
boolean hasCorrelation = false;
boolean hasNonEquiJoinPred = false;
- for(ASTNode conjunctAST : conjuncts) {
- Conjunct conjunct = conjunctAnalyzer.analyzeConjunct(conjunctAST);
- if(conjunct.isCorrelated()){
- hasCorrelation = true;
- }
- if ( conjunct.eitherSideRefersBoth() && conjunctAST.getType() != HiveParser.EQUAL) {
- hasNonEquiJoinPred = true;
+
+ ASTNode whereClause = SubQueryUtils.subQueryWhere(insertClause);
+ if ( whereClause != null ) {
+ ASTNode searchCond = (ASTNode) whereClause.getChild(0);
+ List<ASTNode> conjuncts = new ArrayList<ASTNode>();
+ SubQueryUtils.extractConjuncts(searchCond, conjuncts);
+
+ ConjunctAnalyzer conjunctAnalyzer =
+ new ConjunctAnalyzer(parentQueryRR, forHavingClause, outerQueryAlias);
+
+ for (ASTNode conjunctAST : conjuncts) {
+ Conjunct conjunct = conjunctAnalyzer.analyzeConjunct(conjunctAST);
+ if (conjunct.isCorrelated()) {
+ hasCorrelation = true;
+ }
+ if (conjunct.eitherSideRefersBoth() && conjunctAST.getType() != HiveParser.EQUAL) {
+ hasNonEquiJoinPred = true;
+ }
}
}
+
+ // figure out if there is group by
boolean noImplicityGby = true;
- if ( insertClause.getChild(1).getChildCount() > 3 &&
- insertClause.getChild(1).getChild(3).getType() == HiveParser.TOK_GROUPBY ) {
- if((ASTNode) insertClause.getChild(1).getChild(3) != null){
+ if ( insertClause.getChildCount() > 3 &&
+ insertClause.getChild(3).getType() == HiveParser.TOK_GROUPBY ) {
noImplicityGby = false;
- }
}
/*
@@ -643,22 +641,24 @@ public class QBSubQuery implements ISubQueryJoinInfo {
subQueryAST,
"Scalar subqueries with aggregate cannot have non-equi join predicate"));
}
+ if(!hasWindowing) {
+ subqueryConfig[1] = true;
+ }
if(hasCorrelation) {
- return true;
+ subqueryConfig[0] = true;
}
}
else if(operator.getType() == SubQueryType.IN) {
if(hasCount && hasCorrelation) {
- return true;
+ subqueryConfig[0] = true;
}
}
else if (operator.getType() == SubQueryType.NOT_IN) {
if(hasCorrelation) {
- return true;
+ subqueryConfig[0] = true;
}
}
}
- return false;
}
void validateAndRewriteAST(RowResolver outerQueryRR,
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java
index 4823950..884e034 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java
@@ -61,7 +61,8 @@ public class TestCBORuleFiredOnlyOnce {
// Create rules registry to not trigger a rule more than once
HiveRulesRegistry registry = new HiveRulesRegistry();
- HivePlannerContext context = new HivePlannerContext(null, registry, null, null);
+ HivePlannerContext context = new HivePlannerContext(null, registry, null,
+ null, null);
HepPlanner planner = new HepPlanner(programBuilder.build(), context);
// Cluster
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/test/queries/clientnegative/subquery_scalar_corr_multi_rows.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientnegative/subquery_scalar_corr_multi_rows.q b/ql/src/test/queries/clientnegative/subquery_scalar_corr_multi_rows.q
index e9ea703..e71a60d 100644
--- a/ql/src/test/queries/clientnegative/subquery_scalar_corr_multi_rows.q
+++ b/ql/src/test/queries/clientnegative/subquery_scalar_corr_multi_rows.q
@@ -1,2 +1,3 @@
-- inner query produces more than one row
-select * from part where p_size > (select count(*) from part p where p.p_mfgr = part.p_mfgr group by p_type);
\ No newline at end of file
+select * from part where p_size >
+ (select count(*) from part p where p.p_mfgr = part.p_mfgr group by p_type);
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/a113ede9/ql/src/test/results/clientnegative/subquery_scalar_corr_multi_rows.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientnegative/subquery_scalar_corr_multi_rows.q.out b/ql/src/test/results/clientnegative/subquery_scalar_corr_multi_rows.q.out
index 3235048..8377085 100644
--- a/ql/src/test/results/clientnegative/subquery_scalar_corr_multi_rows.q.out
+++ b/ql/src/test/results/clientnegative/subquery_scalar_corr_multi_rows.q.out
@@ -1,4 +1,5 @@
-PREHOOK: query: select * from part where p_size > (select count(*) from part p where p.p_mfgr = part.p_mfgr group by p_type)
+PREHOOK: query: select * from part where p_size >
+ (select count(*) from part p where p.p_mfgr = part.p_mfgr group by p_type)
PREHOOK: type: QUERY
PREHOOK: Input: default@part
#### A masked pattern was here ####