You are viewing a plain text version of this content; the canonical version is available in the mailing-list archive (the original hyperlink was lost in the plain-text conversion).
Posted to commits@hive.apache.org by na...@apache.org on 2011/07/16 08:42:02 UTC
svn commit: r1147364 - in /hive/trunk: contrib/ eclipse-templates/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/
ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/
Author: namit
Date: Sat Jul 16 06:42:00 2011
New Revision: 1147364
URL: http://svn.apache.org/viewvc?rev=1147364&view=rev
Log:
HIVE-2284 Bucketized map join should allow join key as a superset of
bucketized columns (Ning Zhang via namit)
Added:
hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out
Modified:
hive/trunk/contrib/build.xml
hive/trunk/eclipse-templates/.classpath
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
Modified: hive/trunk/contrib/build.xml
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/build.xml?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/contrib/build.xml (original)
+++ hive/trunk/contrib/build.xml Sat Jul 16 06:42:00 2011
@@ -76,6 +76,7 @@
logFile="${test.log.dir}/testcontribparsegen.log"
logDirectory="${test.log.dir}/contribpositive"/>
+ <!-- the TestContribParseNegative.java got removed?
<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/ql/parse"
templatePath="${ql.test.template.dir}" template="TestParseNegative.vm"
queryDirectory="${contrib.test.query.dir}/negative"
@@ -85,6 +86,7 @@
resultsDirectory="${contrib.test.results.dir}/compiler/errors" className="TestContribParseNegative"
logFile="${test.log.dir}/testcontribparseneggen.log"
logDirectory="${test.log.dir}/contribnegative"/>
+ -->
<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/cli"
templatePath="${ql.test.template.dir}" template="TestCliDriver.vm"
Modified: hive/trunk/eclipse-templates/.classpath
URL: http://svn.apache.org/viewvc/hive/trunk/eclipse-templates/.classpath?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/eclipse-templates/.classpath (original)
+++ hive/trunk/eclipse-templates/.classpath Sat Jul 16 06:42:00 2011
@@ -10,7 +10,7 @@
<classpathentry exported="true" kind="lib" path="cli/lib/jline-@jline.version@.jar"/>
<classpathentry exported="true" kind="lib" path="lib/json.jar"/>
<classpathentry exported="true" kind="lib" path="lib/asm-@asm.version@.jar"/>
- <classpathentry exported="true" kind="lib" path="lib/commons-codec-@commons-codec.version@.jar"/>
+ <classpathentry exported="true" kind="lib" path="build/hadoopcore/hadoop-@HADOOPVER@/lib/commons-codec-@commons-codec.version@.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-lang-@commons-lang.version@.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-@commons-logging.version@.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-api-@commons-logging-api.version@.jar"/>
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java Sat Jul 16 06:42:00 2011
@@ -27,9 +27,9 @@ import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
-import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -143,6 +143,7 @@ public class BucketMapJoinOptimizer impl
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
+
MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
@@ -256,12 +257,12 @@ public class BucketMapJoinOptimizer impl
Iterator<Partition> iter = prunedParts.getConfirmedPartns()
.iterator();
if (iter.hasNext()) {
- part = iter.next();
+ part = iter.next();
}
if (part == null) {
iter = prunedParts.getUnknownPartns().iterator();
if (iter.hasNext()) {
- part = iter.next();
+ part = iter.next();
}
}
assert part != null;
@@ -467,18 +468,13 @@ public class BucketMapJoinOptimizer impl
}
}
- // to see if the join columns from a table is exactly this same as its
- // bucket columns
- if (joinCols.size() == 0 || joinCols.size() != bucketColumns.size()) {
+ // Check if the join columns contains all bucket columns.
+ // If a table is bucketized on column B, but the join key is A and B,
+ // it is easy to see joining on different buckets yield empty results.
+ if (joinCols.size() == 0 || !joinCols.containsAll(bucketColumns)) {
return false;
}
- for (String col : joinCols) {
- if (!bucketColumns.contains(col)) {
- return false;
- }
- }
-
return true;
}
Added: hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q?rev=1147364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q Sat Jul 16 06:42:00 2011
@@ -0,0 +1,18 @@
+
+create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE;
+
+alter table tmp_smb_bucket_10 add partition (ds = '1');
+alter table tmp_smb_bucket_10 add partition (ds = '2');
+
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+
+explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b
+on (a.ds = '1' and b.ds = '2' and
+ a.userid = b.userid and
+ a.pageid = b.pageid and
+ a.postid = b.postid and
+ a.type = b.type);
+
Added: hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out?rev=1147364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out Sat Jul 16 06:42:00 2011
@@ -0,0 +1,119 @@
+PREHOOK: query: create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmp_smb_bucket_10
+PREHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '1')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '1')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=1
+PREHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '2')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '2')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=2
+PREHOOK: query: explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b
+on (a.ds = '1' and b.ds = '2' and
+ a.userid = b.userid and
+ a.pageid = b.pageid and
+ a.postid = b.postid and
+ a.type = b.type)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b
+on (a.ds = '1' and b.ds = '2' and
+ a.userid = b.userid and
+ a.pageid = b.pageid and
+ a.postid = b.postid and
+ a.type = b.type)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tmp_smb_bucket_10) a) (TOK_TABREF (TOK_TABNAME tmp_smb_bucket_10) b) (and (and (and (and (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '2')) (= (. (TOK_TABLE_OR_COL a) userid) (. (TOK_TABLE_OR_COL b) userid))) (= (. (TOK_TABLE_OR_COL a) pageid) (. (TOK_TABLE_OR_COL b) pageid))) (= (. (TOK_TABLE_OR_COL a) postid) (. (TOK_TABLE_OR_COL b) postid))) (= (. (TOK_TABLE_OR_COL a) type) (. (TOK_TABLE_OR_COL b) type))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ Sorted Merge Bucket Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {userid} {pageid} {postid} {type} {ds}
+ 1 {userid} {pageid} {postid} {type} {ds}
+ handleSkewJoin: false
+ keys:
+ 0 [Column[userid], Column[pageid], Column[postid], Column[type]]
+ 1 [Column[userid], Column[pageid], Column[postid], Column[type]]
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col7, _col8, _col9, _col10, _col11
+ Position of Big Table: 1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: int
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col7
+ type: int
+ expr: _col8
+ type: int
+ expr: _col9
+ type: int
+ expr: _col10
+ type: string
+ expr: _col11
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col7, _col8, _col9, _col10, _col11
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: int
+ expr: _col3
+ type: string
+ expr: _col4
+ type: string
+ expr: _col7
+ type: int
+ expr: _col8
+ type: int
+ expr: _col9
+ type: int
+ expr: _col10
+ type: string
+ expr: _col11
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+