You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2017/12/13 20:25:09 UTC
hive git commit: HIVE-18241: Query with LEFT SEMI JOIN producing wrong result (Vineet Garg, reviewed by Jesus Camacho Rodriguez)

Repository: hive
Updated Branches:
  refs/heads/master b7be4acc0 -> 8ab523b96


HIVE-18241: Query with LEFT SEMI JOIN producing wrong result (Vineet Garg, reviewed by Jesus Camacho Rodriguez)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8ab523b9
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8ab523b9
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8ab523b9

Branch: refs/heads/master
Commit: 8ab523b9612e81aec255051bcb2b4137419cfa3b
Parents: b7be4ac
Author: Vineet Garg <vg...@apache.com>
Authored: Wed Dec 13 12:23:37 2017 -0800
Committer: Vineet Garg <vg...@apache.com>
Committed: Wed Dec 13 12:25:02 2017 -0800

----------------------------------------------------------------------
 .../rules/HiveRemoveGBYSemiJoinRule.java        |  66 ++++----
 ql/src/test/queries/clientpositive/semijoin.q   |   7 +
 .../results/clientpositive/llap/semijoin.q.out  | 149 +++++++++++++++++++
 .../results/clientpositive/spark/semijoin.q.out | 148 ++++++++++++++++++
 4 files changed, 341 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
index 4e6cce9..4992e70 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
@@ -22,58 +22,66 @@ import org.apache.calcite.rel.RelNode;
 import org.apache.calcite.rel.core.Aggregate;
 import org.apache.calcite.rel.core.JoinInfo;
 import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.tools.RelBuilder;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-
 /**
  * Planner rule that removes a {@code Aggregate} from a HiveSemiJoin
- * right input
+ * right input.
  */
 public class HiveRemoveGBYSemiJoinRule extends RelOptRule {
 
   protected static final Logger LOG = LoggerFactory.getLogger(HiveRemoveGBYSemiJoinRule.class);
   public static final HiveRemoveGBYSemiJoinRule INSTANCE =
-      new HiveRemoveGBYSemiJoinRule() ;
+      new HiveRemoveGBYSemiJoinRule();
 
   public HiveRemoveGBYSemiJoinRule() {
     super(
-            operand(HiveSemiJoin.class,
-                some(
-                    operand(RelNode.class, any()),
-                    operand(Aggregate.class, any()))),
+        operand(HiveSemiJoin.class,
+            some(
+                operand(RelNode.class, any()),
+                operand(Aggregate.class, any()))),
         HiveRelFactories.HIVE_BUILDER, "HiveRemoveGBYSemiJoinRule");
   }
 
-    @Override public void onMatch(RelOptRuleCall call) {
-      final HiveSemiJoin semijoin= call.rel(0);
+  @Override public void onMatch(RelOptRuleCall call) {
+    final HiveSemiJoin semijoin= call.rel(0);
+
+    if(semijoin.getJoinType() != JoinRelType.INNER) {
+      return;
+    }
+    final RelNode left = call.rel(1);
+    final Aggregate rightAggregate= call.rel(2);
 
-      if(semijoin.getJoinType() != JoinRelType.INNER) {
-        return;
-      }
-      final RelNode left = call.rel(1);
-      final Aggregate rightAggregate= call.rel(2);
+    // if grouping sets are involved do early return
+    if(rightAggregate.getGroupType() != Aggregate.Group.SIMPLE) {
+      return;
+    }
 
-      // if grouping sets are involved do early return
-      if(rightAggregate.indicator) {
-        return;
-      }
+    if(rightAggregate.indicator) {
+      return;
+    }
 
-      // if there is any aggregate function this group by is not un-necessary
-      if(!rightAggregate.getAggCallList().isEmpty()) {
-        return;
-      }
-      final JoinInfo joinInfo = semijoin.analyzeCondition();
+    // if there is any aggregate function this group by is not un-necessary
+    if(!rightAggregate.getAggCallList().isEmpty()) {
+      return;
+    }
+    final JoinInfo joinInfo = semijoin.analyzeCondition();
 
-      boolean shouldTransform = joinInfo.rightSet().equals(
-          ImmutableBitSet.range(rightAggregate.getGroupCount()));
-      if(shouldTransform) {
-        RelNode newSemiJoin = call.builder().push(left).push(rightAggregate.getInput()).semiJoin(semijoin.getCondition()).build();
-        call.transformTo(newSemiJoin);
-      }
+    boolean shouldTransform = joinInfo.rightSet().equals(
+        ImmutableBitSet.range(rightAggregate.getGroupCount()));
+    if(shouldTransform) {
+      final RelBuilder relBuilder = call.builder();
+      RelNode newRightInput = relBuilder.project(relBuilder.push(rightAggregate.getInput()).
+          fields(rightAggregate.getGroupSet().asList())).build();
+      RelNode newSemiJoin = call.builder().push(left).push(newRightInput)
+          .semiJoin(semijoin.getCondition()).build();
+      call.transformTo(newSemiJoin);
     }
   }
+}
 // End HiveRemoveGBYSemiJoinRule

http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/queries/clientpositive/semijoin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/semijoin.q b/ql/src/test/queries/clientpositive/semijoin.q
index c41d222..8f3a6ad 100644
--- a/ql/src/test/queries/clientpositive/semijoin.q
+++ b/ql/src/test/queries/clientpositive/semijoin.q
@@ -77,3 +77,10 @@ select a.key from t3 a left semi join t2 b on a.key = b.key left outer join t1 c
 
 explain select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
 select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
+
+explain select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value;
+select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value;

http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/results/clientpositive/llap/semijoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin.q.out b/ql/src/test/results/clientpositive/llap/semijoin.q.out
index f788ae3..82cee33 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin.q.out
@@ -2745,3 +2745,152 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@t2
 POSTHOOK: Input: default@t3
 #### A masked pattern was here ####
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
+        Reducer 3 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE)
+        Reducer 5 <- Map 4 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: outr
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (key is not null and value is not null) (type: boolean)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: key is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        sort order: 
+                        Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: string), _col1 (type: string)
+            Execution mode: llap
+            LLAP IO: no inputs
+        Map 4 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: value is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+                    Group By Operator
+                      keys: value (type: string)
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Semi Join 0 to 1
+                keys:
+                  0 _col0 (type: string), _col1 (type: string)
+                  1 _col0 (type: string), _col1 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1, _col2
+                residual filter predicates: {(_col1 > _col2)}
+                Statistics: Num rows: 41666 Data size: 11208154 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: _col0 (type: string), _col2 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 41666 Data size: 7416548 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    keys: _col0 (type: string), _col1 (type: string)
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string), _col1 (type: string)
+                      sort order: ++
+                      Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                      Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: string)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####

http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/results/clientpositive/spark/semijoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/semijoin.q.out b/ql/src/test/results/clientpositive/spark/semijoin.q.out
index 8f5354c..6932efa 100644
--- a/ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -2590,3 +2590,151 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@t2
 POSTHOOK: Input: default@t3
 #### A masked pattern was here ####
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Reducer 4 (PARTITION-LEVEL SORT, 2)
+        Reducer 4 <- Map 3 (PARTITION-LEVEL SORT, 1), Reducer 6 (PARTITION-LEVEL SORT, 1)
+        Reducer 6 <- Map 5 (GROUP, 2)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: outr
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (key is not null and value is not null) (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+        Map 3 
+            Map Operator Tree:
+                TableScan
+                  alias: a
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: key is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        sort order: 
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: string), _col1 (type: string)
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: value is not null (type: boolean)
+                    Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: value (type: string)
+                      mode: hash
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Join Operator
+                condition map:
+                     Left Semi Join 0 to 1
+                keys:
+                  0 _col0 (type: string), _col1 (type: string)
+                  1 _col0 (type: string), _col1 (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Reduce Operator Tree:
+              Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 125000 Data size: 2781000 Basic stats: COMPLETE Column stats: NONE
+                Filter Operator
+                  predicate: (_col1 > _col2) (type: boolean)
+                  Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: _col0 (type: string), _col2 (type: string)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: _col0 (type: string), _col1 (type: string)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: string)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+                        Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+        Reducer 6 
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: string)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+    (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+    on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####