You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2019/07/23 04:41:25 UTC
[hive] branch master updated: HIVE-22003: Shared work optimizer may leave semijoin branches in plan that are not used (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
This is an automated email from the ASF dual-hosted git repository.
jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new bf6b988 HIVE-22003: Shared work optimizer may leave semijoin branches in plan that are not used (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
bf6b988 is described below
commit bf6b98876cb8163839e6c108fa1b06219f72ef0d
Author: Jesus Camacho Rodriguez <jc...@apache.org>
AuthorDate: Tue Jul 16 19:37:31 2019 -0700
HIVE-22003: Shared work optimizer may leave semijoin branches in plan that are not used (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
Close apache/hive#729
---
.../test/resources/testconfiguration.properties | 3 +-
.../hive/ql/optimizer/SharedWorkOptimizer.java | 149 ++++++--
ql/src/test/queries/clientpositive/perf/query1b.q | 37 ++
.../perf/tez/constraints/query1b.q.out | 395 ++++++++++++++++++++
.../perf/tez/constraints/query32.q.out | 4 +-
.../perf/tez/constraints/query92.q.out | 4 +-
.../results/clientpositive/perf/tez/query1b.q.out | 397 +++++++++++++++++++++
.../results/clientpositive/perf/tez/query32.q.out | 4 +-
.../results/clientpositive/perf/tez/query65.q.out | 2 +-
.../results/clientpositive/perf/tez/query92.q.out | 4 +-
10 files changed, 961 insertions(+), 38 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 1c7905d..dcd7056 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -1768,7 +1768,8 @@ spark.only.query.negative.files=spark_job_max_tasks.q,\
tez.perf.disabled.query.files=mv_query44.q,\
mv_query67.q
-spark.perf.disabled.query.files=query14.q,\
+spark.perf.disabled.query.files=query1b.q,\
+ query14.q,\
query64.q,\
cbo_query1.q,\
cbo_ext_query1.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java
index 46a34b9..4c27c5b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java
@@ -58,6 +58,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;
import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc;
@@ -73,6 +74,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInBloomFilter;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -357,33 +359,63 @@ public class SharedWorkOptimizer extends Transform {
LOG.debug("Merging subtree starting at {} into subtree starting at {}",
discardableTsOp, retainableTsOp);
} else {
+ ExprNodeDesc newRetainableTsFilterExpr = null;
+ List<ExprNodeDesc> semijoinExprNodes = new ArrayList<>();
if (retainableTsOp.getConf().getFilterExpr() != null) {
+ // Gather SJ expressions and normal expressions
+ List<ExprNodeDesc> allExprNodesExceptSemijoin = new ArrayList<>();
+ splitExpressions(retainableTsOp.getConf().getFilterExpr(),
+ allExprNodesExceptSemijoin, semijoinExprNodes);
+ // Create new expressions
+ if (allExprNodesExceptSemijoin.size() > 1) {
+ newRetainableTsFilterExpr = ExprNodeGenericFuncDesc.newInstance(
+ new GenericUDFOPAnd(), allExprNodesExceptSemijoin);
+ } else if (allExprNodesExceptSemijoin.size() > 0 &&
+ allExprNodesExceptSemijoin.get(0) instanceof ExprNodeGenericFuncDesc) {
+ newRetainableTsFilterExpr = allExprNodesExceptSemijoin.get(0);
+ }
// Push filter on top of children for retainable
pushFilterToTopOfTableScan(optimizerCache, retainableTsOp);
}
+ ExprNodeDesc newDiscardableTsFilterExpr = null;
if (discardableTsOp.getConf().getFilterExpr() != null) {
+ // If there is a single discardable operator, it is a TableScanOperator
+ // and it means that we will merge filter expressions for it. Thus, we
+ // might need to remove DPP predicates before doing that
+ List<ExprNodeDesc> allExprNodesExceptSemijoin = new ArrayList<>();
+ splitExpressions(discardableTsOp.getConf().getFilterExpr(),
+ allExprNodesExceptSemijoin, new ArrayList<>());
+ // Create new expressions
+ if (allExprNodesExceptSemijoin.size() > 1) {
+ newDiscardableTsFilterExpr = ExprNodeGenericFuncDesc.newInstance(
+ new GenericUDFOPAnd(), allExprNodesExceptSemijoin);
+ } else if (allExprNodesExceptSemijoin.size() > 0 &&
+ allExprNodesExceptSemijoin.get(0) instanceof ExprNodeGenericFuncDesc) {
+ newDiscardableTsFilterExpr = allExprNodesExceptSemijoin.get(0);
+ }
+ // Remove and add semijoin filter from expressions
+ replaceSemijoinExpressions(discardableTsOp, semijoinExprNodes);
// Push filter on top of children for discardable
pushFilterToTopOfTableScan(optimizerCache, discardableTsOp);
}
// Obtain filter for shared TS operator
ExprNodeGenericFuncDesc exprNode = null;
- if (retainableTsOp.getConf().getFilterExpr() != null && discardableTsOp.getConf().getFilterExpr() != null) {
+ if (newRetainableTsFilterExpr != null && newDiscardableTsFilterExpr != null) {
// Combine
- exprNode = retainableTsOp.getConf().getFilterExpr();
- ExprNodeGenericFuncDesc tsExprNode = discardableTsOp.getConf().getFilterExpr();
- if (!exprNode.isSame(tsExprNode)) {
+ exprNode = (ExprNodeGenericFuncDesc) newRetainableTsFilterExpr;
+ if (!exprNode.isSame(newDiscardableTsFilterExpr)) {
// We merge filters from previous scan by ORing with filters from current scan
if (exprNode.getGenericUDF() instanceof GenericUDFOPOr) {
List<ExprNodeDesc> newChildren = new ArrayList<>(exprNode.getChildren().size() + 1);
for (ExprNodeDesc childExprNode : exprNode.getChildren()) {
- if (childExprNode.isSame(tsExprNode)) {
+ if (childExprNode.isSame(newDiscardableTsFilterExpr)) {
// We do not need to do anything, it is in the OR expression
break;
}
newChildren.add(childExprNode);
}
if (exprNode.getChildren().size() == newChildren.size()) {
- newChildren.add(tsExprNode);
+ newChildren.add(newDiscardableTsFilterExpr);
exprNode = ExprNodeGenericFuncDesc.newInstance(
new GenericUDFOPOr(),
newChildren);
@@ -391,10 +423,22 @@ public class SharedWorkOptimizer extends Transform {
} else {
exprNode = ExprNodeGenericFuncDesc.newInstance(
new GenericUDFOPOr(),
- Arrays.<ExprNodeDesc>asList(exprNode, tsExprNode));
+ Arrays.asList(exprNode, newDiscardableTsFilterExpr));
}
}
}
+ // Create expression node that will be used for the retainable table scan
+ if (!semijoinExprNodes.isEmpty()) {
+ if (exprNode != null) {
+ semijoinExprNodes.add(0, exprNode);
+ }
+ if (semijoinExprNodes.size() > 1) {
+ exprNode = ExprNodeGenericFuncDesc.newInstance(
+ new GenericUDFOPAnd(), semijoinExprNodes);
+ } else {
+ exprNode = (ExprNodeGenericFuncDesc) semijoinExprNodes.get(0);
+ }
+ }
// Replace filter
retainableTsOp.getConf().setFilterExpr(exprNode);
// Replace table scan operator
@@ -443,26 +487,6 @@ public class SharedWorkOptimizer extends Transform {
OperatorUtils.removeOperator(op);
optimizerCache.removeOp(op);
removedOps.add(op);
- if (sr.discardableOps.size() == 1) {
- // If there is a single discardable operator, it is a TableScanOperator
- // and it means that we have merged filter expressions for it. Thus, we
- // might need to remove DPP predicates from the retainable TableScanOperator
- Collection<Operator<?>> c =
- optimizerCache.tableScanToDPPSource.get((TableScanOperator) op);
- for (Operator<?> dppSource : c) {
- if (dppSource instanceof ReduceSinkOperator) {
- GenTezUtils.removeSemiJoinOperator(pctx,
- (ReduceSinkOperator) dppSource,
- (TableScanOperator) sr.retainableOps.get(0));
- optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op);
- } else if (dppSource instanceof AppMasterEventOperator) {
- GenTezUtils.removeSemiJoinOperator(pctx,
- (AppMasterEventOperator) dppSource,
- (TableScanOperator) sr.retainableOps.get(0));
- optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op);
- }
- }
- }
LOG.debug("Operator removed: {}", op);
}
@@ -486,6 +510,75 @@ public class SharedWorkOptimizer extends Transform {
return mergedExecuted;
}
+ private static void replaceSemijoinExpressions(TableScanOperator tsOp, List<ExprNodeDesc> semijoinExprNodes) {
+ ExprNodeDesc constNode = new ExprNodeConstantDesc(
+ TypeInfoFactory.booleanTypeInfo, Boolean.TRUE);
+ // TS operator
+ if (tsOp.getConf().getFilterExpr() != null) {
+ ExprNodeDesc tsFilterExpr = tsOp.getConf().getFilterExpr();
+ if (FunctionRegistry.isOpAnd(tsFilterExpr)) {
+ tsFilterExpr.getChildren().removeIf(SharedWorkOptimizer::isSemijoinExpr);
+ tsFilterExpr.getChildren().addAll(semijoinExprNodes);
+ if (tsFilterExpr.getChildren().isEmpty() ||
+ (tsFilterExpr.getChildren().size() == 1 && !(tsFilterExpr.getChildren().get(0) instanceof ExprNodeGenericFuncDesc))) {
+ tsOp.getConf().setFilterExpr(null);
+ }
+ }
+ }
+ // Filter operators on top
+ if (tsOp.getChildOperators() != null) {
+ for (Operator op : tsOp.getChildOperators()) {
+ if (op instanceof FilterOperator) {
+ FilterOperator filterOp = (FilterOperator) op;
+ ExprNodeDesc filterExpr = filterOp.getConf().getPredicate();
+ if (FunctionRegistry.isOpAnd(filterExpr)) {
+ filterExpr.getChildren().removeIf(SharedWorkOptimizer::isSemijoinExpr);
+ if (filterExpr.getChildren().isEmpty()) {
+ filterOp.getConf().setPredicate(constNode);
+ } else if (filterExpr.getChildren().size() == 1) {
+ filterOp.getConf().setPredicate(filterExpr.getChildren().get(0));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private static boolean isSemijoinExpr(ExprNodeDesc expr) {
+ if (expr instanceof ExprNodeDynamicListDesc) {
+ // DYNAMIC PARTITION PRUNING
+ return true;
+ }
+ if (FunctionRegistry.isOpBetween(expr) &&
+ expr.getChildren().get(2) instanceof ExprNodeDynamicValueDesc) {
+ // BETWEEN in SJ
+ return true;
+ }
+ if (FunctionRegistry.isOpInBloomFilter(expr) &&
+ expr.getChildren().get(1) instanceof ExprNodeDynamicValueDesc) {
+ // IN_BLOOM_FILTER in SJ
+ return true;
+ }
+ return false;
+ }
+
+ private static void splitExpressions(ExprNodeDesc exprNode,
+ List<ExprNodeDesc> allExprNodesExceptSemijoin, List<ExprNodeDesc> semijoinExprNodes) {
+ if (FunctionRegistry.isOpAnd(exprNode)) {
+ for (ExprNodeDesc expr : exprNode.getChildren()) {
+ if (isSemijoinExpr(expr)) {
+ semijoinExprNodes.add(expr);
+ } else {
+ allExprNodesExceptSemijoin.add(expr);
+ }
+ }
+ } else if (isSemijoinExpr(exprNode)) {
+ semijoinExprNodes.add(exprNode);
+ } else {
+ allExprNodesExceptSemijoin.add(exprNode);
+ }
+ }
+
private static void sharedWorkExtendedOptimization(ParseContext pctx, SharedWorkOptimizerCache optimizerCache)
throws SemanticException {
// Gather RS operators that 1) belong to root works, i.e., works containing TS operators,
@@ -1679,7 +1772,7 @@ public class SharedWorkOptimizer extends Transform {
} else {
ExprNodeGenericFuncDesc newPred = ExprNodeGenericFuncDesc.newInstance(
new GenericUDFOPAnd(),
- Arrays.<ExprNodeDesc>asList(tableScanExprNode.clone(), filterExprNode));
+ Arrays.asList(tableScanExprNode.clone(), filterExprNode));
filterOp.getConf().setPredicate(newPred);
}
} else {
diff --git a/ql/src/test/queries/clientpositive/perf/query1b.q b/ql/src/test/queries/clientpositive/perf/query1b.q
new file mode 100644
index 0000000..ef6745f
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/perf/query1b.q
@@ -0,0 +1,37 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.stats.fetch.column.stats=true;
+set hive.auto.convert.join=true;
+set hive.auto.convert.join.noconditionaltask=true;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.partition.pruning.extended=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.bloom.filter.factor=1.0f;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.max.bloom.filter.entries=10000000000;
+
+
+explain
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_FEE) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =2000
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'NM'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+limit 100;
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out
new file mode 100644
index 0000000..126edca
--- /dev/null
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out
@@ -0,0 +1,395 @@
+PREHOOK: query: explain
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_FEE) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =2000
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'NM'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+limit 100
+PREHOOK: type: QUERY
+PREHOOK: Input: default@customer
+PREHOOK: Input: default@date_dim
+PREHOOK: Input: default@store
+PREHOOK: Input: default@store_returns
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_FEE) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =2000
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'NM'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+limit 100
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@customer
+POSTHOOK: Input: default@date_dim
+POSTHOOK: Input: default@store
+POSTHOOK: Input: default@store_returns
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 11 <- Reducer 7 (BROADCAST_EDGE)
+ Map 3 <- Map 10 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE), Reducer 2 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (BROADCAST_EDGE), Map 3 (SIMPLE_EDGE), Reducer 8 (BROADCAST_EDGE)
+ Reducer 5 <- Map 11 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE)
+ Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
+ Reducer 7 <- Reducer 4 (CUSTOM_SIMPLE_EDGE)
+ Reducer 8 <- Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: store
+ filterExpr: (s_state = 'NM') (type: boolean)
+ Statistics: Num rows: 1704 Data size: 153360 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (s_state = 'NM') (type: boolean)
+ Statistics: Num rows: 35 Data size: 3150 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: s_store_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=35)
+ minReductionHashAggr: 0.9714286
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Execution mode: vectorized
+ Map 10
+ Map Operator Tree:
+ TableScan
+ alias: date_dim
+ filterExpr: (d_year = 2000) (type: boolean)
+ Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (d_year = 2000) (type: boolean)
+ Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: d_date_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized
+ Map 11
+ Map Operator Tree:
+ TableScan
+ alias: customer
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (c_customer_sk BETWEEN DynamicValue(RS_47_store_returns_sr_customer_sk_min) AND DynamicValue(RS_47_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_47_store_returns_sr_customer_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: c_customer_sk (type: int), c_customer_id (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: string)
+ Execution mode: vectorized
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: store_returns
+ filterExpr: (((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null)) and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 57591150 Data size: 6891360020 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2))
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 1 Map 9
+ Statistics: Num rows: 16855704 Data size: 1805298496 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col2 (type: int), _col1 (type: int)
+ minReductionHashAggr: 0.8699312
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+ Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: decimal(17,2))
+ Filter Operator
+ predicate: (sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2))
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 1 Map 10
+ Statistics: Num rows: 17467258 Data size: 1870797840 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col2 (type: int), _col1 (type: int)
+ minReductionHashAggr: 0.85786486
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: decimal(17,2))
+ Execution mode: vectorized
+ Map 9
+ Map Operator Tree:
+ TableScan
+ alias: date_dim
+ filterExpr: (d_year = 2000) (type: boolean)
+ Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (d_year = 2000) (type: boolean)
+ Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: d_date_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=35)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 4
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: _col2 is not null (type: boolean)
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col1 (type: int), _col0 (type: int), _col2 (type: decimal(17,2))
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col2 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col1, _col3, _col4
+ input vertices:
+ 1 Reducer 8
+ Statistics: Num rows: 1991910 Data size: 449166736 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Filter Operator
+ predicate: (_col3 > _col4) (type: boolean)
+ Statistics: Num rows: 663970 Data size: 149722248 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 663970 Data size: 149722248 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 663970 Data size: 1985936 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=68687)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 5
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col7
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col7 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Reducer 6
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 100
+ Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 7
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=68687)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 8
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: int), _col2 (type: decimal(17,2))
+ outputColumnNames: _col1, _col2
+ Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(_col2), count(_col2)
+ keys: _col1 (type: int)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (_col1 is not null and _col2 is not null) (type: boolean)
+ Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: ((_col1 / _col2) * 1.2) (type: decimal(38,11)), _col0 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: decimal(38,11))
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 100
+ Processor Tree:
+ ListSink
+
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out
index 092f790..e8a23e7 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out
@@ -115,7 +115,7 @@ Stage-0
Select Operator [SEL_114] (rows=285116600 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_112] (rows=285116600 width=119)
- predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null)
+ predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
TableScan [TS_0] (rows=287989836 width=119)
default@catalog_sales,catalog_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["cs_sold_date_sk","cs_item_sk","cs_ext_discount_amt"]
<-Reducer 10 [BROADCAST_EDGE] vectorized
@@ -160,7 +160,7 @@ Stage-0
Select Operator [SEL_115] (rows=286549727 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_113] (rows=286549727 width=119)
- predicate:cs_sold_date_sk is not null
+ predicate:(cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
Please refer to the previous TableScan [TS_0]
<-Map 8 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_121]
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out
index 7452f08..0d44384 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out
@@ -119,7 +119,7 @@ Stage-0
Select Operator [SEL_114] (rows=143930905 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_112] (rows=143930905 width=119)
- predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null)
+ predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
TableScan [TS_0] (rows=144002668 width=119)
default@web_sales,web_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["ws_sold_date_sk","ws_item_sk","ws_ext_discount_amt"]
<-Reducer 10 [BROADCAST_EDGE] vectorized
@@ -164,7 +164,7 @@ Stage-0
Select Operator [SEL_115] (rows=143966864 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_113] (rows=143966864 width=119)
- predicate:ws_sold_date_sk is not null
+ predicate:(ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
Please refer to the previous TableScan [TS_0]
<-Map 8 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_121]
diff --git a/ql/src/test/results/clientpositive/perf/tez/query1b.q.out b/ql/src/test/results/clientpositive/perf/tez/query1b.q.out
new file mode 100644
index 0000000..c030334
--- /dev/null
+++ b/ql/src/test/results/clientpositive/perf/tez/query1b.q.out
@@ -0,0 +1,397 @@
+PREHOOK: query: explain
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_FEE) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =2000
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'NM'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+limit 100
+PREHOOK: type: QUERY
+PREHOOK: Input: default@customer
+PREHOOK: Input: default@date_dim
+PREHOOK: Input: default@store
+PREHOOK: Input: default@store_returns
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_FEE) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =2000
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'NM'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+limit 100
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@customer
+POSTHOOK: Input: default@date_dim
+POSTHOOK: Input: default@store
+POSTHOOK: Input: default@store_returns
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 10 <- Reducer 7 (BROADCAST_EDGE)
+ Map 3 <- Map 11 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE), Reducer 2 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+ Reducer 4 <- Map 1 (BROADCAST_EDGE), Map 3 (SIMPLE_EDGE)
+ Reducer 5 <- Map 10 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE), Reducer 8 (BROADCAST_EDGE)
+ Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
+ Reducer 7 <- Reducer 4 (CUSTOM_SIMPLE_EDGE)
+ Reducer 8 <- Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: store
+ filterExpr: ((s_state = 'NM') and s_store_sk is not null) (type: boolean)
+ Statistics: Num rows: 1704 Data size: 153360 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((s_state = 'NM') and s_store_sk is not null) (type: boolean)
+ Statistics: Num rows: 35 Data size: 3150 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: s_store_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 35 Data size: 140 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=31)
+ minReductionHashAggr: 0.9714286
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Execution mode: vectorized
+ Map 10
+ Map Operator Tree:
+ TableScan
+ alias: customer
+ filterExpr: (c_customer_sk is not null and c_customer_sk BETWEEN DynamicValue(RS_44_store_returns_sr_customer_sk_min) AND DynamicValue(RS_44_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_44_store_returns_sr_customer_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (c_customer_sk is not null and c_customer_sk BETWEEN DynamicValue(RS_44_store_returns_sr_customer_sk_min) AND DynamicValue(RS_44_store_returns_sr_customer_sk_max) and in_bloom_filter(c_customer_sk, DynamicValue(RS_44_store_returns_sr_customer_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: c_customer_sk (type: int), c_customer_id (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 80000000 Data size: 8320000000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: string)
+ Execution mode: vectorized
+ Map 11
+ Map Operator Tree:
+ TableScan
+ alias: date_dim
+ filterExpr: ((d_year = 2000) and d_date_sk is not null) (type: boolean)
+ Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((d_year = 2000) and d_date_sk is not null) (type: boolean)
+ Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: d_date_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: store_returns
+ filterExpr: (((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null)) and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 57591150 Data size: 6891360020 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2))
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 51757026 Data size: 6193248408 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 1 Map 9
+ Statistics: Num rows: 16855704 Data size: 1805298496 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col2 (type: int), _col1 (type: int)
+ minReductionHashAggr: 0.8699312
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+ Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: decimal(17,2))
+ Filter Operator
+ predicate: (sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_41_store_s_store_sk_min) AND DynamicValue(RS_41_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_41_store_s_store_sk_bloom_filter))) (type: boolean)
+ Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: sr_returned_date_sk (type: int), sr_customer_sk (type: int), sr_store_sk (type: int), sr_fee (type: decimal(7,2))
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 53634860 Data size: 6417950124 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 1 Map 11
+ Statistics: Num rows: 17467258 Data size: 1870797840 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col2 (type: int), _col1 (type: int)
+ minReductionHashAggr: 0.85786486
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 17467258 Data size: 2081058800 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: decimal(17,2))
+ Execution mode: vectorized
+ Map 9
+ Map Operator Tree:
+ TableScan
+ alias: date_dim
+ filterExpr: ((d_year = 2000) and d_date_sk is not null) (type: boolean)
+ Statistics: Num rows: 73049 Data size: 584392 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((d_year = 2000) and d_date_sk is not null) (type: boolean)
+ Statistics: Num rows: 652 Data size: 5216 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: d_date_sk (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 652 Data size: 2608 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=31)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 4
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: _col2 is not null (type: boolean)
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col1 (type: int), _col0 (type: int), _col2 (type: decimal(17,2))
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 11601100 Data size: 1382161488 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col1, _col2, _col3
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 1923224 Data size: 220816368 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: int), _col3 (type: decimal(17,2))
+ Select Operator
+ expressions: _col1 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1923224 Data size: 5408304 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=68687)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 5
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col2, _col3, _col5
+ Statistics: Num rows: 1923224 Data size: 410434616 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col2 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col3, _col5, _col6
+ input vertices:
+ 1 Reducer 8
+ Statistics: Num rows: 1991910 Data size: 645378840 Basic stats: COMPLETE Column stats: COMPLETE
+ HybridGraceHashJoin: true
+ Filter Operator
+ predicate: (_col3 > _col6) (type: boolean)
+ Statistics: Num rows: 663970 Data size: 215126280 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col5 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Reducer 6
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 663970 Data size: 66397000 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 100
+ Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 100 Data size: 10000 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 7
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=68687)
+ mode: final
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+ Reducer 8
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: int), _col2 (type: decimal(17,2))
+ outputColumnNames: _col1, _col2
+ Statistics: Num rows: 13369812 Data size: 1592886816 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(_col2), count(_col2)
+ keys: _col1 (type: int)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (_col1 is not null and _col2 is not null) (type: boolean)
+ Statistics: Num rows: 29 Data size: 3588 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: ((_col1 / _col2) * 1.2) (type: decimal(38,11)), _col0 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 29 Data size: 3356 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: decimal(38,11))
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 100
+ Processor Tree:
+ ListSink
+
diff --git a/ql/src/test/results/clientpositive/perf/tez/query32.q.out b/ql/src/test/results/clientpositive/perf/tez/query32.q.out
index 0b89356d..97beaf5 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query32.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query32.q.out
@@ -115,7 +115,7 @@ Stage-0
Select Operator [SEL_114] (rows=285116600 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_112] (rows=285116600 width=119)
- predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk is not null)
+ predicate:(cs_ext_discount_amt is not null and cs_sold_date_sk is not null and cs_item_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
TableScan [TS_0] (rows=287989836 width=119)
default@catalog_sales,catalog_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["cs_sold_date_sk","cs_item_sk","cs_ext_discount_amt"]
<-Reducer 10 [BROADCAST_EDGE] vectorized
@@ -160,7 +160,7 @@ Stage-0
Select Operator [SEL_115] (rows=286549727 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_113] (rows=286549727 width=119)
- predicate:(cs_sold_date_sk is not null and cs_item_sk is not null)
+ predicate:(cs_sold_date_sk is not null and cs_item_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
Please refer to the previous TableScan [TS_0]
<-Map 8 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_121]
diff --git a/ql/src/test/results/clientpositive/perf/tez/query65.q.out b/ql/src/test/results/clientpositive/perf/tez/query65.q.out
index 69b12f6..3ef64f8 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query65.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query65.q.out
@@ -197,6 +197,6 @@ Stage-0
Select Operator [SEL_152] (rows=525329897 width=118)
Output:["_col0","_col1","_col2","_col3"]
Filter Operator [FIL_150] (rows=525329897 width=118)
- predicate:(ss_sold_date_sk is not null and ss_store_sk is not null)
+ predicate:(ss_sold_date_sk is not null and ss_store_sk is not null and ss_sold_date_sk BETWEEN DynamicValue(RS_7_date_dim_d_date_sk_min) AND DynamicValue(RS_7_date_dim_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_date_dim_d_date_sk_bloom_filter)))
Please refer to the previous TableScan [TS_0]
diff --git a/ql/src/test/results/clientpositive/perf/tez/query92.q.out b/ql/src/test/results/clientpositive/perf/tez/query92.q.out
index 0a0d54e..edb8961 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query92.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query92.q.out
@@ -119,7 +119,7 @@ Stage-0
Select Operator [SEL_114] (rows=143930905 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_112] (rows=143930905 width=119)
- predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk is not null)
+ predicate:(ws_ext_discount_amt is not null and ws_sold_date_sk is not null and ws_item_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
TableScan [TS_0] (rows=144002668 width=119)
default@web_sales,web_sales,Tbl:COMPLETE,Col:COMPLETE,Output:["ws_sold_date_sk","ws_item_sk","ws_ext_discount_amt"]
<-Reducer 10 [BROADCAST_EDGE] vectorized
@@ -164,7 +164,7 @@ Stage-0
Select Operator [SEL_115] (rows=143966864 width=119)
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_113] (rows=143966864 width=119)
- predicate:(ws_sold_date_sk is not null and ws_item_sk is not null)
+ predicate:(ws_sold_date_sk is not null and ws_item_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter)))
Please refer to the previous TableScan [TS_0]
<-Map 8 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_121]