You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2017/12/13 20:25:09 UTC
hive git commit: HIVE-18241: Query with LEFT SEMI JOIN producing
wrong result (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
Repository: hive
Updated Branches:
refs/heads/master b7be4acc0 -> 8ab523b96
HIVE-18241: Query with LEFT SEMI JOIN producing wrong result (Vineet Garg, reviewed by Jesus Camacho Rodriguez)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8ab523b9
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8ab523b9
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8ab523b9
Branch: refs/heads/master
Commit: 8ab523b9612e81aec255051bcb2b4137419cfa3b
Parents: b7be4ac
Author: Vineet Garg <vg...@apache.com>
Authored: Wed Dec 13 12:23:37 2017 -0800
Committer: Vineet Garg <vg...@apache.com>
Committed: Wed Dec 13 12:25:02 2017 -0800
----------------------------------------------------------------------
.../rules/HiveRemoveGBYSemiJoinRule.java | 66 ++++----
ql/src/test/queries/clientpositive/semijoin.q | 7 +
.../results/clientpositive/llap/semijoin.q.out | 149 +++++++++++++++++++
.../results/clientpositive/spark/semijoin.q.out | 148 ++++++++++++++++++
4 files changed, 341 insertions(+), 29 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
index 4e6cce9..4992e70 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java
@@ -22,58 +22,66 @@ import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Aggregate;
import org.apache.calcite.rel.core.JoinInfo;
import org.apache.calcite.rel.core.JoinRelType;
+import org.apache.calcite.tools.RelBuilder;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
/**
* Planner rule that removes a {@code Aggregate} from a HiveSemiJoin
- * right input
+ * right input.
*/
public class HiveRemoveGBYSemiJoinRule extends RelOptRule {
protected static final Logger LOG = LoggerFactory.getLogger(HiveRemoveGBYSemiJoinRule.class);
public static final HiveRemoveGBYSemiJoinRule INSTANCE =
- new HiveRemoveGBYSemiJoinRule() ;
+ new HiveRemoveGBYSemiJoinRule();
public HiveRemoveGBYSemiJoinRule() {
super(
- operand(HiveSemiJoin.class,
- some(
- operand(RelNode.class, any()),
- operand(Aggregate.class, any()))),
+ operand(HiveSemiJoin.class,
+ some(
+ operand(RelNode.class, any()),
+ operand(Aggregate.class, any()))),
HiveRelFactories.HIVE_BUILDER, "HiveRemoveGBYSemiJoinRule");
}
- @Override public void onMatch(RelOptRuleCall call) {
- final HiveSemiJoin semijoin= call.rel(0);
+ @Override public void onMatch(RelOptRuleCall call) {
+ final HiveSemiJoin semijoin= call.rel(0);
+
+ if(semijoin.getJoinType() != JoinRelType.INNER) {
+ return;
+ }
+ final RelNode left = call.rel(1);
+ final Aggregate rightAggregate= call.rel(2);
- if(semijoin.getJoinType() != JoinRelType.INNER) {
- return;
- }
- final RelNode left = call.rel(1);
- final Aggregate rightAggregate= call.rel(2);
+ // if grouping sets are involved do early return
+ if(rightAggregate.getGroupType() != Aggregate.Group.SIMPLE) {
+ return;
+ }
- // if grouping sets are involved do early return
- if(rightAggregate.indicator) {
- return;
- }
+ if(rightAggregate.indicator) {
+ return;
+ }
- // if there is any aggregate function this group by is not un-necessary
- if(!rightAggregate.getAggCallList().isEmpty()) {
- return;
- }
- final JoinInfo joinInfo = semijoin.analyzeCondition();
+ // if there is any aggregate function this group by is not un-necessary
+ if(!rightAggregate.getAggCallList().isEmpty()) {
+ return;
+ }
+ final JoinInfo joinInfo = semijoin.analyzeCondition();
- boolean shouldTransform = joinInfo.rightSet().equals(
- ImmutableBitSet.range(rightAggregate.getGroupCount()));
- if(shouldTransform) {
- RelNode newSemiJoin = call.builder().push(left).push(rightAggregate.getInput()).semiJoin(semijoin.getCondition()).build();
- call.transformTo(newSemiJoin);
- }
+ boolean shouldTransform = joinInfo.rightSet().equals(
+ ImmutableBitSet.range(rightAggregate.getGroupCount()));
+ if(shouldTransform) {
+ final RelBuilder relBuilder = call.builder();
+ RelNode newRightInput = relBuilder.project(relBuilder.push(rightAggregate.getInput()).
+ fields(rightAggregate.getGroupSet().asList())).build();
+ RelNode newSemiJoin = call.builder().push(left).push(newRightInput)
+ .semiJoin(semijoin.getCondition()).build();
+ call.transformTo(newSemiJoin);
}
}
+}
// End HiveRemoveGBYSemiJoinRule
http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/queries/clientpositive/semijoin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/semijoin.q b/ql/src/test/queries/clientpositive/semijoin.q
index c41d222..8f3a6ad 100644
--- a/ql/src/test/queries/clientpositive/semijoin.q
+++ b/ql/src/test/queries/clientpositive/semijoin.q
@@ -77,3 +77,10 @@ select a.key from t3 a left semi join t2 b on a.key = b.key left outer join t1 c
explain select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
select a.key from t3 a left semi join t2 b on a.value = b.value where a.key > 100;
+
+explain select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value;
+select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value;
http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/results/clientpositive/llap/semijoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin.q.out b/ql/src/test/results/clientpositive/llap/semijoin.q.out
index f788ae3..82cee33 100644
--- a/ql/src/test/results/clientpositive/llap/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/llap/semijoin.q.out
@@ -2745,3 +2745,152 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@t2
POSTHOOK: Input: default@t3
#### A masked pattern was here ####
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
+ Reducer 3 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE)
+ Reducer 5 <- Map 4 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: outr
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: (key is not null and value is not null) (type: boolean)
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string), _col1 (type: string)
+ Execution mode: llap
+ LLAP IO: no inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: src
+ Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: value is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 45500 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ keys: value (type: string)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: llap
+ LLAP IO: no inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 _col0 (type: string), _col1 (type: string)
+ 1 _col0 (type: string), _col1 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0
+ 1
+ outputColumnNames: _col0, _col1, _col2
+ residual filter predicates: {(_col1 > _col2)}
+ Statistics: Num rows: 41666 Data size: 11208154 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 41666 Data size: 7416548 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ keys: _col0 (type: string), _col1 (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+ Statistics: Num rows: 10609 Data size: 1888402 Basic stats: COMPLETE Column stats: COMPLETE
+ Reducer 5
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 250 Data size: 22750 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: string)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+Warning: Shuffle Join MERGEJOIN[30][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
http://git-wip-us.apache.org/repos/asf/hive/blob/8ab523b9/ql/src/test/results/clientpositive/spark/semijoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/semijoin.q.out b/ql/src/test/results/clientpositive/spark/semijoin.q.out
index 8f5354c..6932efa 100644
--- a/ql/src/test/results/clientpositive/spark/semijoin.q.out
+++ b/ql/src/test/results/clientpositive/spark/semijoin.q.out
@@ -2590,3 +2590,151 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@t2
POSTHOOK: Input: default@t3
#### A masked pattern was here ####
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: explain select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Reducer 4 (PARTITION-LEVEL SORT, 2)
+ Reducer 4 <- Map 3 (PARTITION-LEVEL SORT, 1), Reducer 6 (PARTITION-LEVEL SORT, 1)
+ Reducer 6 <- Map 5 (GROUP, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: outr
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (key is not null and value is not null) (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string), _col1 (type: string)
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: src
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: value is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: value (type: string)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reducer 2
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 _col0 (type: string), _col1 (type: string)
+ 1 _col0 (type: string), _col1 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 45832 Data size: 1019683 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Reducer 4
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0
+ 1
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 125000 Data size: 2781000 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (_col1 > _col2) (type: boolean)
+ Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ keys: _col0 (type: string), _col1 (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string), _col1 (type: string)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
+ Statistics: Num rows: 41666 Data size: 926985 Basic stats: COMPLETE Column stats: NONE
+ Reducer 6
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string)
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+Warning: Shuffle Join JOIN[15][tables = [$hdt$_1, $hdt$_2]] in Work 'Reducer 4' is a cross product
+PREHOOK: query: select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select key, value from src outr left semi join
+ (select a.key, b.value from src a join (select distinct value from src) b on a.value > b.value group by a.key, b.value) inr
+ on outr.key=inr.key and outr.value=inr.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####