You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2019/08/23 20:26:38 UTC
[hive] branch master updated: HIVE-22074: Slow compilation due to
IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
This is an automated email from the ASF dual-hosted git repository.
vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new ece9a54 HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
ece9a54 is described below
commit ece9a54b78e41e2acfe9cd5df45fcddf3a4b05ab
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Fri Aug 23 13:25:16 2019 -0700
HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 2 +
.../calcite/translator/RexNodeConverter.java | 25 +++-
.../hadoop/hive/ql/parse/CalcitePlanner.java | 11 +-
.../hadoop/hive/ql/parse/TypeCheckProcFactory.java | 26 ++--
.../queries/clientpositive/in_typecheck_char.q | 12 ++
.../results/clientpositive/in_typecheck_char.q.out | 146 +++++++++++++++++++++
6 files changed, 205 insertions(+), 17 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 9fd7dca..69408f6 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2270,6 +2270,8 @@ public class HiveConf extends Configuration {
"Whether to transform OR clauses in Filter operators into IN clauses"),
HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 2,
"Minimum number of OR clauses needed to transform into IN clauses"),
+ HIVEOPT_TRANSFORM_IN_MAXNODES("hive.optimize.transform.in.maxnodes", 16,
+ "Maximum number of IN expressions beyond which IN will not be transformed into OR clause"),
HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
"Whether to transform count distinct into two stages"),
HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
index 100ee0b..63882fd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
@@ -142,6 +142,7 @@ public class RexNodeConverter {
private final RowResolver outerRR;
private final ImmutableMap<String, Integer> outerNameToPosMap;
private int correlatedId;
+ private final int maxNodesForInToOrTransformation;
//Constructor used by HiveRexExecutorImpl
public RexNodeConverter(RelOptCluster cluster) {
@@ -151,13 +152,15 @@ public class RexNodeConverter {
//subqueries will need outer query's row resolver
public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
ImmutableMap<String, Integer> outerNameToPosMap,
- ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR, int offset, boolean flattenExpr, int correlatedId) {
+ ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR,
+ int maxNodesForInToOrTransformation, int offset, boolean flattenExpr, int correlatedId) {
this.cluster = cluster;
this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, hiveRR, offset));
this.flattenExpr = flattenExpr;
this.outerRR = outerRR;
this.outerNameToPosMap = outerNameToPosMap;
this.correlatedId = correlatedId;
+ this.maxNodesForInToOrTransformation = maxNodesForInToOrTransformation;
}
public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
@@ -167,6 +170,7 @@ public class RexNodeConverter {
this.flattenExpr = flattenExpr;
this.outerRR = null;
this.outerNameToPosMap = null;
+ this.maxNodesForInToOrTransformation = 0;
}
public RexNodeConverter(RelOptCluster cluster, List<InputCtx> inpCtxLst, boolean flattenExpr) {
@@ -175,6 +179,7 @@ public class RexNodeConverter {
this.flattenExpr = flattenExpr;
this.outerRR = null;
this.outerNameToPosMap = null;
+ this.maxNodesForInToOrTransformation = 0;
}
public RexNode convert(ExprNodeDesc expr) throws SemanticException {
@@ -423,12 +428,22 @@ public class RexNodeConverter {
// from IN [A,B] => EQUALS [A,B]
// except complex types
calciteOp = SqlStdOperatorTable.EQUALS;
- } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)) {
+ } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)){
// if it is more than an single item in an IN clause,
// transform from IN [A,B,C] => OR [EQUALS [A,B], EQUALS [A,C]]
// except complex types
- childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst);
- calciteOp = SqlStdOperatorTable.OR;
+ // Rewrite to OR is done only if number of operands are less than
+ // the threshold configured
+ boolean rewriteToOr = true;
+ if(this.maxNodesForInToOrTransformation != 0) {
+ if(childRexNodeLst.size() > this.maxNodesForInToOrTransformation) {
+ rewriteToOr = false;
+ }
+ }
+ if(rewriteToOr) {
+ childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst);
+ calciteOp = SqlStdOperatorTable.OR;
+ }
}
} else if (calciteOp.getKind() == SqlKind.COALESCE &&
childRexNodeLst.size() > 1) {
@@ -576,7 +591,7 @@ public class RexNodeConverter {
RexNode node = nodes.get(i);
if ((i % 2 == 1 || i == nodes.size() - 1)
&& !node.getType().getSqlTypeName().equals(retType.getSqlTypeName())) {
- newNodes.add(cluster.getRexBuilder().makeCast(retType, node));
+ newNodes.add(cluster.getRexBuilder().makeCast(retType, node));
} else {
newNodes.add(node);
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 212d27a..8d9718f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -3195,6 +3195,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
.get(srcRel);
RexNode convertedFilterExpr = new RexNodeConverter(cluster, srcRel.getRowType(),
outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), outerRR,
+ HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
0, true, subqueryId).convert(filterCondn);
RexNode factoredFilterExpr = RexUtil
.pullFactors(cluster.getRexBuilder(), convertedFilterExpr);
@@ -3428,8 +3429,9 @@ public class CalcitePlanner extends SemanticAnalyzer {
ImmutableMap<String, Integer> hiveColNameCalcitePosMap = this.relToHiveColNameCalcitePosMap
.get(srcRel);
RexNode convertedFilterLHS = new RexNodeConverter(cluster, srcRel.getRowType(),
- outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel),
- outerRR, 0, true, subqueryId).convert(subQueryExpr);
+ outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel),
+ outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
+ 0, true, subqueryId).convert(subQueryExpr);
RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
srcRel, convertedFilterLHS);
@@ -4695,8 +4697,9 @@ public class CalcitePlanner extends SemanticAnalyzer {
List<RexNode> calciteColLst = new ArrayList<RexNode>();
RexNodeConverter rexNodeConv = new RexNodeConverter(cluster, srcRel.getRowType(),
- outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel),
- outerRR, 0, false, subqueryId);
+ outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel),
+ outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
+ 0, false, subqueryId);
for (ExprNodeDesc colExpr : col_list) {
calciteColLst.add(rexNodeConv.convert(colExpr));
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
index 0c81986..a4c1b9a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
@@ -1220,16 +1220,26 @@ public class TypeCheckProcFactory {
}
outputOpList.add(nullConst);
}
+
if (!ctx.isCBOExecuted()) {
- ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children);
- if (orOperands != null) {
- if (orOperands.size() == 1) {
- orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false));
+
+ HiveConf conf;
+ try {
+ conf = Hive.get().getConf();
+ } catch (HiveException e) {
+ throw new SemanticException(e);
+ }
+ if( children.size() <= HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES)) {
+ ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children);
+ if (orOperands != null) {
+ if (orOperands.size() == 1) {
+ orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false));
+ }
+ funcText = "or";
+ genericUDF = new GenericUDFOPOr();
+ children.clear();
+ children.addAll(orOperands);
}
- funcText = "or";
- genericUDF = new GenericUDFOPOr();
- children.clear();
- children.addAll(orOperands);
}
}
}
diff --git a/ql/src/test/queries/clientpositive/in_typecheck_char.q b/ql/src/test/queries/clientpositive/in_typecheck_char.q
index 3955c4b..a144d51 100644
--- a/ql/src/test/queries/clientpositive/in_typecheck_char.q
+++ b/ql/src/test/queries/clientpositive/in_typecheck_char.q
@@ -22,3 +22,15 @@ select 'expected 2',count(*) from ax where (s,t) in (('a','a'),(null, 'bb'));
-- this is right now broken; HIVE-20779 should fix it
explain select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null;
select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null;
+
+set hive.optimize.point.lookup=false;
+explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+
+set hive.optimize.transform.in.maxnodes=20;
+explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
diff --git a/ql/src/test/results/clientpositive/in_typecheck_char.q.out b/ql/src/test/results/clientpositive/in_typecheck_char.q.out
index cb9e777..4d59ddf 100644
--- a/ql/src/test/results/clientpositive/in_typecheck_char.q.out
+++ b/ql/src/test/results/clientpositive/in_typecheck_char.q.out
@@ -262,3 +262,149 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@ax
#### A masked pattern was here ####
expected 1 1
+PREHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+ HiveFilter(condition=[IN($1, _UTF-16LE'a ', _UTF-16LE'bb ', _UTF-16LE'aa ', _UTF-16LE'bbb ', _UTF-16LE'ab ', _UTF-16LE'ba ', _UTF-16LE'aaa ', _UTF-16LE'bbb ', _UTF-16LE'abc ', _UTF-16LE'bc ', _UTF-16LE'ac ', _UTF-16LE'bca ', _UTF-16LE'cab ', _UTF-16LE'cb ', _UTF-16LE'ca ', _UTF-16LE'cbc ', _UTF-16LE'z ')])
+ HiveTableScan(table=[[default, ax]], table:alias=[ax])
+
+PREHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: ax
+ filterExpr: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+ HiveFilter(condition=[OR(=($1, _UTF-16LE'a  '), =($1, _UTF-16LE'bb '), =($1, _UTF-16LE'aa '), =($1, _UTF-16LE'bbb'), =($1, _UTF-16LE'ab '), =($1, _UTF-16LE'ba '), =($1, _UTF-16LE'aaa'), =($1, _UTF-16LE'abc'), =($1, _UTF-16LE'bc '), =($1, _UTF-16LE'ac '), =($1, _UTF-16LE'bca'), =($1, _UTF-16LE'cab'), =($1, _UTF-16LE'cb '), =($1, _UTF-16LE'ca '), =($1, _UTF-16LE'cbc'), =($1, _UTF-16LE'z  '))])
+ HiveTableScan(table=[[default, ax]], table:alias=[ax])
+
+PREHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: ax
+ filterExpr: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean)
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+