You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/11/10 17:20:23 UTC
svn commit: r1540486 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
test/queries/clientpositive/correlationoptimizer1.q
test/results/clientpositive/correlationoptimizer1.q.out
Author: hashutosh
Date: Sun Nov 10 16:20:22 2013
New Revision: 1540486
URL: http://svn.apache.org/r1540486
Log:
HIVE-5697 : Correlation Optimizer may generate wrong plans for cases involving outer join (Yin Huai via Ashutosh Chauhan)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
hive/trunk/ql/src/test/queries/clientpositive/correlationoptimizer1.q
hive/trunk/ql/src/test/results/clientpositive/correlationoptimizer1.q.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java?rev=1540486&r1=1540485&r2=1540486&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/CorrelationOptimizer.java Sun Nov 10 16:20:22 2013
@@ -397,24 +397,31 @@ public class CorrelationOptimizer implem
}
}
if (current instanceof JoinOperator) {
- LinkedHashSet<ReduceSinkOperator> correlatedRsOps =
- new LinkedHashSet<ReduceSinkOperator>();
+ boolean isCorrelated = true;
+ int expectedNumCorrelatedRsops = current.getParentOperators().size();
+ LinkedHashSet<ReduceSinkOperator> correlatedRsops = null;
for (Operator<? extends OperatorDesc> parent : current.getParentOperators()) {
Set<String> tableNames =
pCtx.getOpParseCtx().get(parent).getRowResolver().getTableNames();
for (String tbl : tableNames) {
if (tableNeedToCheck.contains(tbl)) {
- correlatedRsOps.addAll(findCorrelatedReduceSinkOperators(
- current, backtrackedKeyCols, backtrackedPartitionCols, childRSOrder,
- parent, correlation));
+ correlatedRsops = findCorrelatedReduceSinkOperators(current,
+ backtrackedKeyCols, backtrackedPartitionCols,
+ childRSOrder, parent, correlation);
+ if (correlatedRsops.size() != expectedNumCorrelatedRsops) {
+ isCorrelated = false;
+ }
}
}
+ if (!isCorrelated) {
+ break;
+ }
}
// If current is JoinOperaotr, we will stop to traverse the tree
// when any of parent ReduceSinkOperaotr of this JoinOperator is
// not considered as a correlated ReduceSinkOperator.
- if (correlatedRsOps.size() == current.getParentOperators().size()) {
- correlatedReduceSinkOperators.addAll(correlatedRsOps);
+ if (isCorrelated && correlatedRsops != null) {
+ correlatedReduceSinkOperators.addAll(correlatedRsops);
} else {
correlatedReduceSinkOperators.clear();
}
Modified: hive/trunk/ql/src/test/queries/clientpositive/correlationoptimizer1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/correlationoptimizer1.q?rev=1540486&r1=1540485&r2=1540486&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/correlationoptimizer1.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/correlationoptimizer1.q Sun Nov 10 16:20:22 2013
@@ -104,7 +104,7 @@ FROM (SELECT x.key AS key, count(1) AS c
set hive.optimize.correlation=false;
-- If the key of a GroupByOperator is the right table's key in
--- a Left Outer Join, we cannot use a single MR to execute these two
+-- a Left Outer Join, we cannot use a single MR to execute these two
-- operators because those keys with a null value are not grouped.
EXPLAIN
SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt))
@@ -130,6 +130,29 @@ FROM (SELECT y.key AS key, count(1) AS c
GROUP BY y.key) tmp;
set hive.optimize.correlation=false;
+-- If a column of the key of a GroupByOperator is the right table's key in
+-- a Left Outer Join, we cannot use a single MR to execute these two
+-- operators because those keys with a null value are not grouped.
+EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value;
+
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value;
+
+set hive.optimize.correlation=true;
+EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value;
+
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value;
+
+set hive.optimize.correlation=false;
-- If the key of a GroupByOperator is the right table's key in
-- a Right Outer Join, these two operators will be executed in
-- the same MR job when Correlation Optimizer is enabled.
Modified: hive/trunk/ql/src/test/results/clientpositive/correlationoptimizer1.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/correlationoptimizer1.q.out?rev=1540486&r1=1540485&r2=1540486&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/correlationoptimizer1.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/correlationoptimizer1.q.out Sun Nov 10 16:20:22 2013
@@ -1269,7 +1269,7 @@ POSTHOOK: Input: default@src1
#### A masked pattern was here ####
652447 47
PREHOOK: query: -- If the key of a GroupByOperator is the right table's key in
--- a Left Outer Join, we cannot use a single MR to execute these two
+-- a Left Outer Join, we cannot use a single MR to execute these two
-- operators because those keys with a null value are not grouped.
EXPLAIN
SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt))
@@ -1278,7 +1278,7 @@ FROM (SELECT y.key AS key, count(1) AS c
GROUP BY y.key) tmp
PREHOOK: type: QUERY
POSTHOOK: query: -- If the key of a GroupByOperator is the right table's key in
--- a Left Outer Join, we cannot use a single MR to execute these two
+-- a Left Outer Join, we cannot use a single MR to execute these two
-- operators because those keys with a null value are not grouped.
EXPLAIN
SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt))
@@ -1654,6 +1654,372 @@ POSTHOOK: Input: default@src
POSTHOOK: Input: default@src1
#### A masked pattern was here ####
652447 47
+PREHOOK: query: -- If a column of the key of a GroupByOperator is the right table's key in
+-- a Left Outer Join, we cannot use a single MR to execute these two
+-- operators because those keys with a null value are not grouped.
+EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- If a column of the key of a GroupByOperator is the right table's key in
+-- a Left Outer Join, we cannot use a single MR to execute these two
+-- operators because those keys with a null value are not grouped.
+EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (AND (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)) (= (. (TOK_TABLE_OR_COL x) value) (. (TOK_TABLE_OR_COL y) value))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) value)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) value))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ x
+ TableScan
+ alias: x
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ y
+ TableScan
+ alias: y
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ tag: 1
+ value expressions:
+ expr: value
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col5
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+#### A masked pattern was here ####
+ NULL 10
+128 NULL 1
+146 val_146 2
+150 val_150 1
+213 val_213 2
+224 NULL 1
+238 val_238 2
+255 val_255 2
+273 val_273 3
+278 val_278 2
+311 val_311 3
+369 NULL 1
+401 val_401 5
+406 val_406 4
+66 val_66 1
+98 val_98 2
+PREHOOK: query: EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME src1) x) (TOK_TABREF (TOK_TABNAME src) y) (AND (= (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) key)) (= (. (TOK_TABLE_OR_COL x) value) (. (TOK_TABLE_OR_COL y) value))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL x) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL y) value)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (. (TOK_TABLE_OR_COL x) key) (. (TOK_TABLE_OR_COL y) value))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ x
+ TableScan
+ alias: x
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ y
+ TableScan
+ alias: y
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ tag: 1
+ value expressions:
+ expr: value
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col5
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT x.key, y.value, count(1) AS cnt
+FROM src1 x LEFT OUTER JOIN src y ON (x.key = y.key AND x.value = y.value)
+GROUP BY x.key, y.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+#### A masked pattern was here ####
+ NULL 10
+128 NULL 1
+146 val_146 2
+150 val_150 1
+213 val_213 2
+224 NULL 1
+238 val_238 2
+255 val_255 2
+273 val_273 3
+278 val_278 2
+311 val_311 3
+369 NULL 1
+401 val_401 5
+406 val_406 4
+66 val_66 1
+98 val_98 2
PREHOOK: query: -- If the key of a GroupByOperator is the right table's key in
-- a Right Outer Join, these two operators will be executed in
-- the same MR job when Correlation Optimizer is enabled.