You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2019/03/05 19:10:02 UTC
[hive] branch master updated: HIVE-21340: CBO: Prune non-key
columns feeding into a SemiJoin (Vineet Garg,
reviewed by Jesus Camacho Rodriguez)
This is an automated email from the ASF dual-hosted git repository.
vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 9dc28db HIVE-21340: CBO: Prune non-key columns feeding into a SemiJoin (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
9dc28db is described below
commit 9dc28dbbdb500cd090dfa9b8c7a49eca71db8e40
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Tue Mar 5 11:09:30 2019 -0800
HIVE-21340: CBO: Prune non-key columns feeding into a SemiJoin (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
---
.../optimizer/calcite/rules/HiveSemiJoinRule.java | 48 ++++++++++----------
ql/src/test/queries/clientpositive/semijoin.q | 13 ++++++
.../results/clientpositive/llap/semijoin.q.out | 51 +++++++++++++++++++++-
.../clientpositive/perf/tez/cbo_query14.q.out | 6 +--
.../clientpositive/perf/tez/cbo_query83.q.out | 12 ++---
.../perf/tez/constraints/cbo_query23.q.out | 8 ++--
.../perf/tez/constraints/cbo_query83.q.out | 12 ++---
.../results/clientpositive/spark/semijoin.q.out | 51 +++++++++++++++++++++-
8 files changed, 156 insertions(+), 45 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
index 7ce924a..07ce762 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
@@ -36,7 +36,6 @@ import org.apache.calcite.rex.RexNode;
import org.apache.calcite.tools.RelBuilder;
import org.apache.calcite.tools.RelBuilderFactory;
import org.apache.calcite.util.ImmutableBitSet;
-import org.apache.calcite.util.ImmutableIntList;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -72,6 +71,26 @@ public abstract class HiveSemiJoinRule extends RelOptRule {
super(operand, relBuilder, null);
}
+ private RelNode buildProject(final Aggregate aggregate, RexBuilder rexBuilder, RelBuilder relBuilder) {
+ assert(!aggregate.indicator && aggregate.getAggCallList().isEmpty());
+ RelNode input = aggregate.getInput();
+ List<Integer> groupingKeys = aggregate.getGroupSet().asList();
+ List<RexNode> projects = new ArrayList<>();
+ for(Integer keys:groupingKeys) {
+ projects.add(rexBuilder.makeInputRef(input, keys.intValue()));
+ }
+ return relBuilder.push(aggregate.getInput()).project(projects).build();
+ }
+
+ private boolean needProject(final RelNode input, final RelNode aggregate) {
+ if((input instanceof HepRelVertex
+ && ((HepRelVertex)input).getCurrentRel() instanceof Join)
+ || input.getRowType().getFieldCount() != aggregate.getRowType().getFieldCount()) {
+ return true;
+ }
+ return false;
+ }
+
protected void perform(RelOptRuleCall call, ImmutableBitSet topRefs,
RelNode topOperator, Join join, RelNode left, Aggregate aggregate) {
LOG.debug("Matched HiveSemiJoinRule");
@@ -107,29 +126,14 @@ public abstract class HiveSemiJoinRule extends RelOptRule {
for (int key : joinInfo.rightKeys) {
newRightKeyBuilder.add(aggregateKeys.get(key));
}
- final ImmutableIntList newRightKeys =
- ImmutableIntList.copyOf(newRightKeyBuilder);
- final RelNode newRight = aggregate.getInput();
+ RelNode input = aggregate.getInput();
+ final RelNode newRight = needProject(input, aggregate) ?
+ buildProject(aggregate, rexBuilder, call.builder()) : input;
final RexNode newCondition =
RelOptUtil.createEquiJoinCondition(left, joinInfo.leftKeys, newRight,
- newRightKeys, rexBuilder);
-
- RelNode semi = null;
- //HIVE-15458: we need to add a Project on top of Join since SemiJoin with Join as it's right input
- // is not expected further down the pipeline. see jira for more details
- if(aggregate.getInput() instanceof HepRelVertex
- && ((HepRelVertex)aggregate.getInput()).getCurrentRel() instanceof Join) {
- Join rightJoin = (Join)(((HepRelVertex)aggregate.getInput()).getCurrentRel());
- List<RexNode> projects = new ArrayList<>();
- for(int i=0; i<rightJoin.getRowType().getFieldCount(); i++){
- projects.add(rexBuilder.makeInputRef(rightJoin, i));
- }
- RelNode topProject = call.builder().push(rightJoin).project(projects, rightJoin.getRowType().getFieldNames(),
- true).build();
- semi = call.builder().push(left).push(topProject).semiJoin(newCondition).build();
- } else {
- semi = call.builder().push(left).push(aggregate.getInput()).semiJoin(newCondition).build();
- }
+ joinInfo.rightKeys, rexBuilder);
+
+ RelNode semi = call.builder().push(left).push(newRight).semiJoin(newCondition).build();
call.transformTo(topOperator.copy(topOperator.getTraitSet(), ImmutableList.of(semi)));
}
diff --git a/ql/src/test/queries/clientpositive/semijoin.q b/ql/src/test/queries/clientpositive/semijoin.q
index e1d31cc..692c366 100644
--- a/ql/src/test/queries/clientpositive/semijoin.q
+++ b/ql/src/test/queries/clientpositive/semijoin.q
@@ -90,3 +90,16 @@ select key, value from src outr left semi join
explain cbo select pp.p_partkey from (select distinct p_name from part) p join part pp on pp.p_name = p.p_name;
select pp.p_partkey from (select distinct p_name from part) p join part pp on pp.p_name = p.p_name;
+
+explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss);
+
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss);
diff --git a/ql/src/test/results/clientpositive/llap/semijoin.q.out b/ql/src/test/results/clientpositive/llap/semijoin.q.out
index a1e4788..22b867a 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin.q.out
@@ -3260,11 +3260,11 @@ POSTHOOK: Input: default@part
#### A masked pattern was here ####
CBO PLAN:
HiveProject(p_partkey=[$0])
- HiveSemiJoin(condition=[=($1, $3)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(p_partkey=[$0], p_name=[$1])
HiveFilter(condition=[IS NOT NULL($1)])
HiveTableScan(table=[[default, part]], table:alias=[pp])
- HiveProject(p_partkey=[$0], p_name=[$1], p_mfgr=[$2], p_brand=[$3], p_type=[$4], p_size=[$5], p_container=[$6], p_retailprice=[$7], p_comment=[$8], BLOCK__OFFSET__INSIDE__FILE=[$9], INPUT__FILE__NAME=[$10], ROW__ID=[$11])
+ HiveProject(p_name=[$1])
HiveFilter(condition=[IS NOT NULL($1)])
HiveTableScan(table=[[default, part]], table:alias=[part])
@@ -3302,3 +3302,50 @@ POSTHOOK: Input: default@part
85768
86428
90681
+PREHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+ HiveSemiJoin(condition=[=($0, $1)], joinType=[inner])
+ HiveProject(p_partkey=[$0])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, part]], table:alias=[pp])
+ HiveProject(p_partkey=[$0])
+ HiveFilter(condition=[>($2, 1)])
+ HiveAggregate(group=[{0, 1}], agg#0=[count()])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, part]], table:alias=[part])
+
+PREHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+2
diff --git a/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out b/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
index c09da86..c85f42c 100644
--- a/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
@@ -242,7 +242,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
HiveProject(d_date_sk=[$0])
HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+ HiveProject(i_item_sk=[$0])
HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
@@ -336,7 +336,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
HiveProject(d_date_sk=[$0])
HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+ HiveProject(i_item_sk=[$0])
HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
@@ -430,7 +430,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
HiveProject(d_date_sk=[$0])
HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+ HiveProject(i_item_sk=[$0])
HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
diff --git a/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out b/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
index 0e61e45..8211db1 100644
--- a/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
@@ -164,11 +164,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], CAST=[CAST($1):DOUBLE])
@@ -188,11 +188,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], CAST=[CAST($1):DOUBLE])
@@ -212,11 +212,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current_m [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
index e5b0d19..7e4644a 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
@@ -125,7 +125,7 @@ HiveSortLimit(fetch=[100])
HiveProject($f0=[$0])
HiveUnion(all=[true])
HiveProject($f0=[*(CAST($4):DECIMAL(10, 0), $5)])
- HiveSemiJoin(condition=[=($3, $8)], joinType=[inner])
+ HiveSemiJoin(condition=[=($3, $7)], joinType=[inner])
HiveJoin(condition=[=($2, $0)], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject($f0=[$0])
HiveJoin(condition=[>($1, *(0.95, $3))], joinType=[inner], algorithm=[none], cost=[not available])
@@ -169,7 +169,7 @@ HiveSortLimit(fetch=[100])
HiveProject(d_date_sk=[$0])
HiveFilter(condition=[AND(=($6, 1999), =($8, 1))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(substr=[$0], i_item_sk=[$1], d_date=[$2], $f3=[$3])
+ HiveProject(i_item_sk=[$1])
HiveFilter(condition=[>($3, 4)])
HiveProject(substr=[$2], i_item_sk=[$1], d_date=[$0], $f3=[$3])
HiveAggregate(group=[{3, 4, 5}], agg#0=[count()])
@@ -184,7 +184,7 @@ HiveSortLimit(fetch=[100])
HiveProject(i_item_sk=[$0], substr=[substr($4, 1, 30)])
HiveTableScan(table=[[default, item]], table:alias=[item])
HiveProject($f0=[*(CAST($4):DECIMAL(10, 0), $5)])
- HiveSemiJoin(condition=[=($2, $8)], joinType=[inner])
+ HiveSemiJoin(condition=[=($2, $7)], joinType=[inner])
HiveJoin(condition=[=($3, $0)], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject($f0=[$0])
HiveJoin(condition=[>($1, *(0.95, $3))], joinType=[inner], algorithm=[none], cost=[not available])
@@ -228,7 +228,7 @@ HiveSortLimit(fetch=[100])
HiveProject(d_date_sk=[$0])
HiveFilter(condition=[AND(=($6, 1999), =($8, 1))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(substr=[$0], i_item_sk=[$1], d_date=[$2], $f3=[$3])
+ HiveProject(i_item_sk=[$1])
HiveFilter(condition=[>($3, 4)])
HiveProject(substr=[$2], i_item_sk=[$1], d_date=[$0], $f3=[$3])
HiveAggregate(group=[{3, 4, 5}], agg#0=[count()])
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
index ef313ad..58ed6da 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
@@ -160,11 +160,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
@@ -182,11 +182,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
@@ -204,11 +204,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveAggregate(group=[{0}])
- HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
- HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current_m [...]
+ HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
diff --git a/ql/src/test/results/clientpositive/spark/semijoin.q.out b/ql/src/test/results/clientpositive/spark/semijoin.q.out
index 38db76d..8a2b12f 100644
--- a/ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -2931,11 +2931,11 @@ POSTHOOK: Input: default@part
#### A masked pattern was here ####
CBO PLAN:
HiveProject(p_partkey=[$0])
- HiveSemiJoin(condition=[=($1, $3)], joinType=[inner])
+ HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
HiveProject(p_partkey=[$0], p_name=[$1])
HiveFilter(condition=[IS NOT NULL($1)])
HiveTableScan(table=[[default, part]], table:alias=[pp])
- HiveProject(p_partkey=[$0], p_name=[$1], p_mfgr=[$2], p_brand=[$3], p_type=[$4], p_size=[$5], p_container=[$6], p_retailprice=[$7], p_comment=[$8], BLOCK__OFFSET__INSIDE__FILE=[$9], INPUT__FILE__NAME=[$10], ROW__ID=[$11])
+ HiveProject(p_name=[$1])
HiveFilter(condition=[IS NOT NULL($1)])
HiveTableScan(table=[[default, part]], table:alias=[part])
@@ -2973,3 +2973,50 @@ POSTHOOK: Input: default@part
85768
86428
90681
+PREHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+ HiveSemiJoin(condition=[=($0, $1)], joinType=[inner])
+ HiveProject(p_partkey=[$0])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, part]], table:alias=[pp])
+ HiveProject(p_partkey=[$0])
+ HiveFilter(condition=[>($2, 1)])
+ HiveAggregate(group=[{0, 1}], agg#0=[count()])
+ HiveFilter(condition=[IS NOT NULL($0)])
+ HiveTableScan(table=[[default, part]], table:alias=[part])
+
+PREHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+ part group by p_partkey ,p_name
+ having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+2