You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2019/08/23 20:26:38 UTC

[hive] branch master updated: HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)

This is an automated email from the ASF dual-hosted git repository.

vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new ece9a54  HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
ece9a54 is described below

commit ece9a54b78e41e2acfe9cd5df45fcddf3a4b05ab
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Fri Aug 23 13:25:16 2019 -0700

    HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
---
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |   2 +
 .../calcite/translator/RexNodeConverter.java       |  25 +++-
 .../hadoop/hive/ql/parse/CalcitePlanner.java       |  11 +-
 .../hadoop/hive/ql/parse/TypeCheckProcFactory.java |  26 ++--
 .../queries/clientpositive/in_typecheck_char.q     |  12 ++
 .../results/clientpositive/in_typecheck_char.q.out | 146 +++++++++++++++++++++
 6 files changed, 205 insertions(+), 17 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 9fd7dca..69408f6 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2270,6 +2270,8 @@ public class HiveConf extends Configuration {
          "Whether to transform OR clauses in Filter operators into IN clauses"),
     HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 2,
              "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVEOPT_TRANSFORM_IN_MAXNODES("hive.optimize.transform.in.maxnodes", 16,
+        "Maximum number of IN expressions beyond which IN will not be transformed into OR clause"),
     HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
                  "Whether to transform count distinct into two stages"),
    HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
index 100ee0b..63882fd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java
@@ -142,6 +142,7 @@ public class RexNodeConverter {
   private final RowResolver             outerRR;
   private final ImmutableMap<String, Integer> outerNameToPosMap;
   private int correlatedId;
+  private final int maxNodesForInToOrTransformation;
 
   //Constructor used by HiveRexExecutorImpl
   public RexNodeConverter(RelOptCluster cluster) {
@@ -151,13 +152,15 @@ public class RexNodeConverter {
   //subqueries will need outer query's row resolver
   public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
       ImmutableMap<String, Integer> outerNameToPosMap,
-      ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR, int offset, boolean flattenExpr, int correlatedId) {
+      ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR,
+      int maxNodesForInToOrTransformation, int offset, boolean flattenExpr, int correlatedId) {
     this.cluster = cluster;
     this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, hiveRR, offset));
     this.flattenExpr = flattenExpr;
     this.outerRR = outerRR;
     this.outerNameToPosMap = outerNameToPosMap;
     this.correlatedId = correlatedId;
+    this.maxNodesForInToOrTransformation = maxNodesForInToOrTransformation;
   }
 
   public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType,
@@ -167,6 +170,7 @@ public class RexNodeConverter {
     this.flattenExpr = flattenExpr;
     this.outerRR = null;
     this.outerNameToPosMap = null;
+    this.maxNodesForInToOrTransformation = 0;
   }
 
   public RexNodeConverter(RelOptCluster cluster, List<InputCtx> inpCtxLst, boolean flattenExpr) {
@@ -175,6 +179,7 @@ public class RexNodeConverter {
     this.flattenExpr = flattenExpr;
     this.outerRR  = null;
     this.outerNameToPosMap = null;
+    this.maxNodesForInToOrTransformation = 0;
   }
 
   public RexNode convert(ExprNodeDesc expr) throws SemanticException {
@@ -423,12 +428,22 @@ public class RexNodeConverter {
           // from IN [A,B] => EQUALS [A,B]
           // except complex types
           calciteOp = SqlStdOperatorTable.EQUALS;
-        } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)) {
+        } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)){
           // if it is more than an single item in an IN clause,
           // transform from IN [A,B,C] => OR [EQUALS [A,B], EQUALS [A,C]]
           // except complex types
-          childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst);
-          calciteOp = SqlStdOperatorTable.OR;
+          // Rewrite to OR is done only if number of operands are less than
+          // the threshold configured
+          boolean rewriteToOr = true;
+          if(this.maxNodesForInToOrTransformation != 0) {
+            if(childRexNodeLst.size() > this.maxNodesForInToOrTransformation) {
+              rewriteToOr = false;
+            }
+          }
+          if(rewriteToOr) {
+            childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst);
+            calciteOp = SqlStdOperatorTable.OR;
+          }
         }
       } else if (calciteOp.getKind() == SqlKind.COALESCE &&
           childRexNodeLst.size() > 1) {
@@ -576,7 +591,7 @@ public class RexNodeConverter {
       RexNode node = nodes.get(i);
       if ((i % 2 == 1 || i == nodes.size() - 1)
           && !node.getType().getSqlTypeName().equals(retType.getSqlTypeName())) {
-          newNodes.add(cluster.getRexBuilder().makeCast(retType, node));
+        newNodes.add(cluster.getRexBuilder().makeCast(retType, node));
       } else {
         newNodes.add(node);
       }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 212d27a..8d9718f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -3195,6 +3195,7 @@ public class CalcitePlanner extends SemanticAnalyzer {
           .get(srcRel);
       RexNode convertedFilterExpr = new RexNodeConverter(cluster, srcRel.getRowType(),
           outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), outerRR,
+      HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
               0, true, subqueryId).convert(filterCondn);
       RexNode factoredFilterExpr = RexUtil
           .pullFactors(cluster.getRexBuilder(), convertedFilterExpr);
@@ -3428,8 +3429,9 @@ public class CalcitePlanner extends SemanticAnalyzer {
         ImmutableMap<String, Integer> hiveColNameCalcitePosMap = this.relToHiveColNameCalcitePosMap
                 .get(srcRel);
         RexNode convertedFilterLHS = new RexNodeConverter(cluster, srcRel.getRowType(),
-                outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel),
-                outerRR, 0, true, subqueryId).convert(subQueryExpr);
+            outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel),
+            outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
+            0, true, subqueryId).convert(subQueryExpr);
 
         RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
                 srcRel, convertedFilterLHS);
@@ -4695,8 +4697,9 @@ public class CalcitePlanner extends SemanticAnalyzer {
       List<RexNode> calciteColLst = new ArrayList<RexNode>();
 
       RexNodeConverter rexNodeConv = new RexNodeConverter(cluster, srcRel.getRowType(),
-              outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel),
-              outerRR, 0, false, subqueryId);
+          outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel),
+          outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES),
+          0, false, subqueryId);
       for (ExprNodeDesc colExpr : col_list) {
         calciteColLst.add(rexNodeConv.convert(colExpr));
       }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
index 0c81986..a4c1b9a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java
@@ -1220,16 +1220,26 @@ public class TypeCheckProcFactory {
             }
             outputOpList.add(nullConst);
           }
+
           if (!ctx.isCBOExecuted()) {
-            ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children);
-            if (orOperands != null) {
-              if (orOperands.size() == 1) {
-                orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false));
+
+            HiveConf conf;
+            try {
+              conf = Hive.get().getConf();
+            } catch (HiveException e) {
+              throw new SemanticException(e);
+            }
+            if( children.size() <= HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES)) {
+              ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children);
+              if (orOperands != null) {
+                if (orOperands.size() == 1) {
+                  orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false));
+                }
+                funcText = "or";
+                genericUDF = new GenericUDFOPOr();
+                children.clear();
+                children.addAll(orOperands);
               }
-              funcText = "or";
-              genericUDF = new GenericUDFOPOr();
-              children.clear();
-              children.addAll(orOperands);
             }
           }
         }
diff --git a/ql/src/test/queries/clientpositive/in_typecheck_char.q b/ql/src/test/queries/clientpositive/in_typecheck_char.q
index 3955c4b..a144d51 100644
--- a/ql/src/test/queries/clientpositive/in_typecheck_char.q
+++ b/ql/src/test/queries/clientpositive/in_typecheck_char.q
@@ -22,3 +22,15 @@ select 'expected 2',count(*) from ax where (s,t) in (('a','a'),(null, 'bb'));
 -- this is right now broken; HIVE-20779 should fix it
 explain select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null;
 select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null;
+
+set hive.optimize.point.lookup=false;
+explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+
+set hive.optimize.transform.in.maxnodes=20;
+explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
+explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z');
diff --git a/ql/src/test/results/clientpositive/in_typecheck_char.q.out b/ql/src/test/results/clientpositive/in_typecheck_char.q.out
index cb9e777..4d59ddf 100644
--- a/ql/src/test/results/clientpositive/in_typecheck_char.q.out
+++ b/ql/src/test/results/clientpositive/in_typecheck_char.q.out
@@ -262,3 +262,149 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@ax
 #### A masked pattern was here ####
 expected 1	1
+PREHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+  HiveFilter(condition=[IN($1, _UTF-16LE'a         ', _UTF-16LE'bb        ', _UTF-16LE'aa        ', _UTF-16LE'bbb       ', _UTF-16LE'ab        ', _UTF-16LE'ba        ', _UTF-16LE'aaa       ', _UTF-16LE'bbb       ', _UTF-16LE'abc       ', _UTF-16LE'bc        ', _UTF-16LE'ac        ', _UTF-16LE'bca       ', _UTF-16LE'cab       ', _UTF-16LE'cb        ', _UTF-16LE'ca        ', _UTF-16LE'cbc       ', _UTF-16LE'z         ')])
+    HiveTableScan(table=[[default, ax]], table:alias=[ax])
+
+PREHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: ax
+            filterExpr: (t) IN ('a         ', 'bb        ', 'aa        ', 'bbb       ', 'ab        ', 'ba        ', 'aaa       ', 'bbb       ', 'abc       ', 'bc        ', 'ac        ', 'bca       ', 'cab       ', 'cb        ', 'ca        ', 'cbc       ', 'z         ') (type: boolean)
+            Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: (t) IN ('a         ', 'bb        ', 'aa        ', 'bbb       ', 'ab        ', 'ba        ', 'aaa       ', 'bbb       ', 'abc       ', 'bc        ', 'ac        ', 'bca       ', 'cab       ', 'cb        ', 'ca        ', 'cbc       ', 'z         ') (type: boolean)
+              Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  minReductionHashAggr: 0.99
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col0 (type: bigint)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+  HiveFilter(condition=[OR(=($1, _UTF-16LE'a         '), =($1, _UTF-16LE'bb        '), =($1, _UTF-16LE'aa        '), =($1, _UTF-16LE'bbb       '), =($1, _UTF-16LE'ab        '), =($1, _UTF-16LE'ba        '), =($1, _UTF-16LE'aaa       '), =($1, _UTF-16LE'abc       '), =($1, _UTF-16LE'bc        '), =($1, _UTF-16LE'ac        '), =($1, _UTF-16LE'bca       '), =($1, _UTF-16LE'cab       '), =($1, _UTF-16LE'cb        '), =($1, _UTF-16LE'ca        '), =($1, _UTF-16LE'cbc       '), =($1, _UTF-16LE [...]
+    HiveTableScan(table=[[default, ax]], table:alias=[ax])
+
+PREHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ax
+#### A masked pattern was here ####
+POSTHOOK: query: explain select count(*) from ax where t in
+('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ax
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: ax
+            filterExpr: ((t = 'a         ') or (t = 'bb        ') or (t = 'aa        ') or (t = 'bbb       ') or (t = 'ab        ') or (t = 'ba        ') or (t = 'aaa       ') or (t = 'abc       ') or (t = 'bc        ') or (t = 'ac        ') or (t = 'bca       ') or (t = 'cab       ') or (t = 'cb        ') or (t = 'ca        ') or (t = 'cbc       ') or (t = 'z         ')) (type: boolean)
+            Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: ((t = 'a         ') or (t = 'bb        ') or (t = 'aa        ') or (t = 'bbb       ') or (t = 'ab        ') or (t = 'ba        ') or (t = 'aaa       ') or (t = 'abc       ') or (t = 'bc        ') or (t = 'ac        ') or (t = 'bca       ') or (t = 'cab       ') or (t = 'cb        ') or (t = 'ca        ') or (t = 'cbc       ') or (t = 'z         ')) (type: boolean)
+              Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  minReductionHashAggr: 0.99
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col0 (type: bigint)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+