You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2019/03/05 19:10:02 UTC

[hive] branch master updated: HIVE-21340: CBO: Prune non-key columns feeding into a SemiJoin (Vineet Garg, reviewed by Jesus Camacho Rodriguez)

This is an automated email from the ASF dual-hosted git repository.

vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 9dc28db  HIVE-21340: CBO: Prune non-key columns feeding into a SemiJoin (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
9dc28db is described below

commit 9dc28dbbdb500cd090dfa9b8c7a49eca71db8e40
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Tue Mar 5 11:09:30 2019 -0800

    HIVE-21340: CBO: Prune non-key columns feeding into a SemiJoin (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
---
 .../optimizer/calcite/rules/HiveSemiJoinRule.java  | 48 ++++++++++----------
 ql/src/test/queries/clientpositive/semijoin.q      | 13 ++++++
 .../results/clientpositive/llap/semijoin.q.out     | 51 +++++++++++++++++++++-
 .../clientpositive/perf/tez/cbo_query14.q.out      |  6 +--
 .../clientpositive/perf/tez/cbo_query83.q.out      | 12 ++---
 .../perf/tez/constraints/cbo_query23.q.out         |  8 ++--
 .../perf/tez/constraints/cbo_query83.q.out         | 12 ++---
 .../results/clientpositive/spark/semijoin.q.out    | 51 +++++++++++++++++++++-
 8 files changed, 156 insertions(+), 45 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
index 7ce924a..07ce762 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSemiJoinRule.java
@@ -36,7 +36,6 @@ import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.tools.RelBuilder;
 import org.apache.calcite.tools.RelBuilderFactory;
 import org.apache.calcite.util.ImmutableBitSet;
-import org.apache.calcite.util.ImmutableIntList;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -72,6 +71,26 @@ public abstract class HiveSemiJoinRule extends RelOptRule {
     super(operand, relBuilder, null);
   }
 
+  private RelNode buildProject(final Aggregate aggregate, RexBuilder rexBuilder, RelBuilder relBuilder) {
+    assert(!aggregate.indicator && aggregate.getAggCallList().isEmpty());
+    RelNode input = aggregate.getInput();
+    List<Integer> groupingKeys = aggregate.getGroupSet().asList();
+    List<RexNode> projects = new ArrayList<>();
+    for(Integer keys:groupingKeys) {
+      projects.add(rexBuilder.makeInputRef(input, keys.intValue()));
+    }
+    return relBuilder.push(aggregate.getInput()).project(projects).build();
+  }
+
+  private boolean needProject(final RelNode input, final RelNode aggregate) {
+    if((input instanceof HepRelVertex
+        && ((HepRelVertex)input).getCurrentRel() instanceof  Join)
+        || input.getRowType().getFieldCount() != aggregate.getRowType().getFieldCount()) {
+      return true;
+    }
+    return false;
+  }
+
   protected void perform(RelOptRuleCall call, ImmutableBitSet topRefs,
                          RelNode topOperator, Join join, RelNode left, Aggregate aggregate) {
     LOG.debug("Matched HiveSemiJoinRule");
@@ -107,29 +126,14 @@ public abstract class HiveSemiJoinRule extends RelOptRule {
     for (int key : joinInfo.rightKeys) {
       newRightKeyBuilder.add(aggregateKeys.get(key));
     }
-    final ImmutableIntList newRightKeys =
-        ImmutableIntList.copyOf(newRightKeyBuilder);
-    final RelNode newRight = aggregate.getInput();
+    RelNode input = aggregate.getInput();
+    final RelNode newRight = needProject(input, aggregate) ?
+        buildProject(aggregate, rexBuilder, call.builder()) : input;
     final RexNode newCondition =
         RelOptUtil.createEquiJoinCondition(left, joinInfo.leftKeys, newRight,
-                                           newRightKeys, rexBuilder);
-
-    RelNode semi = null;
-    //HIVE-15458: we need to add a Project on top of Join since SemiJoin with Join as it's right input
-    // is not expected further down the pipeline. see jira for more details
-    if(aggregate.getInput() instanceof HepRelVertex
-        && ((HepRelVertex)aggregate.getInput()).getCurrentRel() instanceof  Join) {
-      Join rightJoin = (Join)(((HepRelVertex)aggregate.getInput()).getCurrentRel());
-      List<RexNode> projects = new ArrayList<>();
-      for(int i=0; i<rightJoin.getRowType().getFieldCount(); i++){
-        projects.add(rexBuilder.makeInputRef(rightJoin, i));
-      }
-      RelNode topProject =  call.builder().push(rightJoin).project(projects, rightJoin.getRowType().getFieldNames(),
-                                                                   true).build();
-      semi = call.builder().push(left).push(topProject).semiJoin(newCondition).build();
-    } else {
-      semi = call.builder().push(left).push(aggregate.getInput()).semiJoin(newCondition).build();
-    }
+                                           joinInfo.rightKeys, rexBuilder);
+
+    RelNode semi = call.builder().push(left).push(newRight).semiJoin(newCondition).build();
     call.transformTo(topOperator.copy(topOperator.getTraitSet(), ImmutableList.of(semi)));
   }
 
diff --git a/ql/src/test/queries/clientpositive/semijoin.q b/ql/src/test/queries/clientpositive/semijoin.q
index e1d31cc..692c366 100644
--- a/ql/src/test/queries/clientpositive/semijoin.q
+++ b/ql/src/test/queries/clientpositive/semijoin.q
@@ -90,3 +90,16 @@ select key, value from src outr left semi join
 
 explain cbo select pp.p_partkey from (select distinct p_name from part) p join part pp on pp.p_name = p.p_name;
 select pp.p_partkey from (select distinct p_name from part) p join part pp on pp.p_name = p.p_name;
+
+explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss);
+
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss);
diff --git a/ql/src/test/results/clientpositive/llap/semijoin.q.out b/ql/src/test/results/clientpositive/llap/semijoin.q.out
index a1e4788..22b867a 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin.q.out
@@ -3260,11 +3260,11 @@ POSTHOOK: Input: default@part
 #### A masked pattern was here ####
 CBO PLAN:
 HiveProject(p_partkey=[$0])
-  HiveSemiJoin(condition=[=($1, $3)], joinType=[inner])
+  HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
     HiveProject(p_partkey=[$0], p_name=[$1])
       HiveFilter(condition=[IS NOT NULL($1)])
         HiveTableScan(table=[[default, part]], table:alias=[pp])
-    HiveProject(p_partkey=[$0], p_name=[$1], p_mfgr=[$2], p_brand=[$3], p_type=[$4], p_size=[$5], p_container=[$6], p_retailprice=[$7], p_comment=[$8], BLOCK__OFFSET__INSIDE__FILE=[$9], INPUT__FILE__NAME=[$10], ROW__ID=[$11])
+    HiveProject(p_name=[$1])
       HiveFilter(condition=[IS NOT NULL($1)])
         HiveTableScan(table=[[default, part]], table:alias=[part])
 
@@ -3302,3 +3302,50 @@ POSTHOOK: Input: default@part
 85768
 86428
 90681
+PREHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+  HiveSemiJoin(condition=[=($0, $1)], joinType=[inner])
+    HiveProject(p_partkey=[$0])
+      HiveFilter(condition=[IS NOT NULL($0)])
+        HiveTableScan(table=[[default, part]], table:alias=[pp])
+    HiveProject(p_partkey=[$0])
+      HiveFilter(condition=[>($2, 1)])
+        HiveAggregate(group=[{0, 1}], agg#0=[count()])
+          HiveFilter(condition=[IS NOT NULL($0)])
+            HiveTableScan(table=[[default, part]], table:alias=[part])
+
+PREHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+2
diff --git a/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out b/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
index c09da86..c85f42c 100644
--- a/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/cbo_query14.q.out
@@ -242,7 +242,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
                           HiveProject(d_date_sk=[$0])
                             HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
                               HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                      HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+                      HiveProject(i_item_sk=[$0])
                         HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
                           HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
                             HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
@@ -336,7 +336,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
                           HiveProject(d_date_sk=[$0])
                             HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
                               HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                      HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+                      HiveProject(i_item_sk=[$0])
                         HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
                           HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
                             HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
@@ -430,7 +430,7 @@ HiveSortLimit(sort0=[$0], sort1=[$1], sort2=[$2], sort3=[$3], dir0=[ASC], dir1=[
                           HiveProject(d_date_sk=[$0])
                             HiveFilter(condition=[AND(=($6, 2000), =($8, 11), IS NOT NULL($0))])
                               HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                      HiveProject(i_item_sk=[$0], i_brand_id=[$1], i_class_id=[$2], i_category_id=[$3], $f0=[$4], $f1=[$5], $f2=[$6])
+                      HiveProject(i_item_sk=[$0])
                         HiveJoin(condition=[AND(AND(=($1, $4), =($2, $5)), =($3, $6))], joinType=[inner], algorithm=[none], cost=[not available])
                           HiveProject(i_item_sk=[$0], i_brand_id=[$7], i_class_id=[$9], i_category_id=[$11])
                             HiveFilter(condition=[AND(IS NOT NULL($7), IS NOT NULL($9), IS NOT NULL($11), IS NOT NULL($0))])
diff --git a/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out b/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
index 0e61e45..8211db1 100644
--- a/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/cbo_query83.q.out
@@ -164,11 +164,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                   HiveProject(d_date=[$0])
                     HiveAggregate(group=[{0}])
-                      HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                      HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                         HiveProject(d_date=[$2], d_week_seq=[$4])
                           HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                        HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+                        HiveProject(d_week_seq=[$4])
                           HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
         HiveProject($f0=[$0], $f1=[$1], CAST=[CAST($1):DOUBLE])
@@ -188,11 +188,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                   HiveProject(d_date=[$0])
                     HiveAggregate(group=[{0}])
-                      HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                      HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                         HiveProject(d_date=[$2], d_week_seq=[$4])
                           HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                        HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+                        HiveProject(d_week_seq=[$4])
                           HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
       HiveProject($f0=[$0], $f1=[$1], CAST=[CAST($1):DOUBLE])
@@ -212,11 +212,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                     HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                 HiveProject(d_date=[$0])
                   HiveAggregate(group=[{0}])
-                    HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                    HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                       HiveProject(d_date=[$2], d_week_seq=[$4])
                         HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                           HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                      HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current_m [...]
+                      HiveProject(d_week_seq=[$4])
                         HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                           HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
 
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
index e5b0d19..7e4644a 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query23.q.out
@@ -125,7 +125,7 @@ HiveSortLimit(fetch=[100])
       HiveProject($f0=[$0])
         HiveUnion(all=[true])
           HiveProject($f0=[*(CAST($4):DECIMAL(10, 0), $5)])
-            HiveSemiJoin(condition=[=($3, $8)], joinType=[inner])
+            HiveSemiJoin(condition=[=($3, $7)], joinType=[inner])
               HiveJoin(condition=[=($2, $0)], joinType=[inner], algorithm=[none], cost=[not available])
                 HiveProject($f0=[$0])
                   HiveJoin(condition=[>($1, *(0.95, $3))], joinType=[inner], algorithm=[none], cost=[not available])
@@ -169,7 +169,7 @@ HiveSortLimit(fetch=[100])
                   HiveProject(d_date_sk=[$0])
                     HiveFilter(condition=[AND(=($6, 1999), =($8, 1))])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-              HiveProject(substr=[$0], i_item_sk=[$1], d_date=[$2], $f3=[$3])
+              HiveProject(i_item_sk=[$1])
                 HiveFilter(condition=[>($3, 4)])
                   HiveProject(substr=[$2], i_item_sk=[$1], d_date=[$0], $f3=[$3])
                     HiveAggregate(group=[{3, 4, 5}], agg#0=[count()])
@@ -184,7 +184,7 @@ HiveSortLimit(fetch=[100])
                         HiveProject(i_item_sk=[$0], substr=[substr($4, 1, 30)])
                           HiveTableScan(table=[[default, item]], table:alias=[item])
           HiveProject($f0=[*(CAST($4):DECIMAL(10, 0), $5)])
-            HiveSemiJoin(condition=[=($2, $8)], joinType=[inner])
+            HiveSemiJoin(condition=[=($2, $7)], joinType=[inner])
               HiveJoin(condition=[=($3, $0)], joinType=[inner], algorithm=[none], cost=[not available])
                 HiveProject($f0=[$0])
                   HiveJoin(condition=[>($1, *(0.95, $3))], joinType=[inner], algorithm=[none], cost=[not available])
@@ -228,7 +228,7 @@ HiveSortLimit(fetch=[100])
                   HiveProject(d_date_sk=[$0])
                     HiveFilter(condition=[AND(=($6, 1999), =($8, 1))])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-              HiveProject(substr=[$0], i_item_sk=[$1], d_date=[$2], $f3=[$3])
+              HiveProject(i_item_sk=[$1])
                 HiveFilter(condition=[>($3, 4)])
                   HiveProject(substr=[$2], i_item_sk=[$1], d_date=[$0], $f3=[$3])
                     HiveAggregate(group=[{3, 4, 5}], agg#0=[count()])
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
index ef313ad..58ed6da 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query83.q.out
@@ -160,11 +160,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                   HiveProject(d_date=[$0])
                     HiveAggregate(group=[{0}])
-                      HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                      HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                         HiveProject(d_date=[$2], d_week_seq=[$4])
                           HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                        HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+                        HiveProject(d_week_seq=[$4])
                           HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
               HiveProject(i_item_sk=[$0], i_item_id=[$1])
@@ -182,11 +182,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                       HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                   HiveProject(d_date=[$0])
                     HiveAggregate(group=[{0}])
-                      HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                      HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                         HiveProject(d_date=[$2], d_week_seq=[$4])
                           HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                        HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current [...]
+                        HiveProject(d_week_seq=[$4])
                           HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                             HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
               HiveProject(i_item_sk=[$0], i_item_id=[$1])
@@ -204,11 +204,11 @@ HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
                     HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
                 HiveProject(d_date=[$0])
                   HiveAggregate(group=[{0}])
-                    HiveSemiJoin(condition=[=($1, $6)], joinType=[inner])
+                    HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
                       HiveProject(d_date=[$2], d_week_seq=[$4])
                         HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                           HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
-                      HiveProject(d_date_sk=[$0], d_date_id=[$1], d_date=[$2], d_month_seq=[$3], d_week_seq=[$4], d_quarter_seq=[$5], d_year=[$6], d_dow=[$7], d_moy=[$8], d_dom=[$9], d_qoy=[$10], d_fy_year=[$11], d_fy_quarter_seq=[$12], d_fy_week_seq=[$13], d_day_name=[$14], d_quarter_name=[$15], d_holiday=[$16], d_weekend=[$17], d_following_holiday=[$18], d_first_dom=[$19], d_last_dom=[$20], d_same_day_ly=[$21], d_same_day_lq=[$22], d_current_day=[$23], d_current_week=[$24], d_current_m [...]
+                      HiveProject(d_week_seq=[$4])
                         HiveFilter(condition=[AND(IN($2, _UTF-16LE'1998-01-02', _UTF-16LE'1998-10-15', _UTF-16LE'1998-11-10'), IS NOT NULL($4))])
                           HiveTableScan(table=[[default, date_dim]], table:alias=[date_dim])
             HiveProject(i_item_sk=[$0], i_item_id=[$1])
diff --git a/ql/src/test/results/clientpositive/spark/semijoin.q.out b/ql/src/test/results/clientpositive/spark/semijoin.q.out
index 38db76d..8a2b12f 100644
--- a/ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -2931,11 +2931,11 @@ POSTHOOK: Input: default@part
 #### A masked pattern was here ####
 CBO PLAN:
 HiveProject(p_partkey=[$0])
-  HiveSemiJoin(condition=[=($1, $3)], joinType=[inner])
+  HiveSemiJoin(condition=[=($1, $2)], joinType=[inner])
     HiveProject(p_partkey=[$0], p_name=[$1])
       HiveFilter(condition=[IS NOT NULL($1)])
         HiveTableScan(table=[[default, part]], table:alias=[pp])
-    HiveProject(p_partkey=[$0], p_name=[$1], p_mfgr=[$2], p_brand=[$3], p_type=[$4], p_size=[$5], p_container=[$6], p_retailprice=[$7], p_comment=[$8], BLOCK__OFFSET__INSIDE__FILE=[$9], INPUT__FILE__NAME=[$10], ROW__ID=[$11])
+    HiveProject(p_name=[$1])
       HiveFilter(condition=[IS NOT NULL($1)])
         HiveTableScan(table=[[default, part]], table:alias=[part])
 
@@ -2973,3 +2973,50 @@ POSTHOOK: Input: default@part
 85768
 86428
 90681
+PREHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{}], agg#0=[count()])
+  HiveSemiJoin(condition=[=($0, $1)], joinType=[inner])
+    HiveProject(p_partkey=[$0])
+      HiveFilter(condition=[IS NOT NULL($0)])
+        HiveTableScan(table=[[default, part]], table:alias=[pp])
+    HiveProject(p_partkey=[$0])
+      HiveFilter(condition=[>($2, 1)])
+        HiveAggregate(group=[{0, 1}], agg#0=[count()])
+          HiveFilter(condition=[IS NOT NULL($0)])
+            HiveTableScan(table=[[default, part]], table:alias=[part])
+
+PREHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@part
+#### A masked pattern was here ####
+POSTHOOK: query: with ss as
+(select count(1), p_partkey, p_name from
+            part group by p_partkey ,p_name
+            having count(1) > 1)
+select count(1) from part pp where pp.p_partkey IN (select p_partkey from ss)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@part
+#### A masked pattern was here ####
+2