Posted to commits@hive.apache.org by na...@apache.org on 2011/07/16 08:42:02 UTC

svn commit: r1147364 - in /hive/trunk: contrib/ eclipse-templates/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientpositive/

Author: namit
Date: Sat Jul 16 06:42:00 2011
New Revision: 1147364

URL: http://svn.apache.org/viewvc?rev=1147364&view=rev
Log:
HIVE-2284 Bucketized map join should allow join key as a superset of
          bucketized columns (Ning Zhang via namit)
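
For illustration only, a minimal Java sketch (illustrative class and method names, not the committed code) of the relaxed check introduced here: the bucket map join is now allowed whenever every bucket column appears among the join keys, i.e. the join key may be a superset of the bucketized columns rather than exactly equal to them.

    import java.util.Arrays;
    import java.util.List;

    public class BucketJoinKeyCheck {
      // Relaxed condition: non-empty join keys that contain all bucket columns.
      static boolean joinKeysCoverBucketColumns(List<String> joinCols,
                                                List<String> bucketColumns) {
        return !joinCols.isEmpty() && joinCols.containsAll(bucketColumns);
      }

      public static void main(String[] args) {
        // Mirrors the new smb_mapjoin_10.q test below: a table clustered by userid,
        // joined on (userid, pageid, postid, type).
        List<String> bucketCols = Arrays.asList("userid");
        List<String> joinCols = Arrays.asList("userid", "pageid", "postid", "type");
        // The old exact-equality requirement rejected this case; the relaxed
        // containment check accepts it.
        System.out.println(joinKeysCoverBucketColumns(joinCols, bucketCols)); // true
      }
    }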


Added:
    hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
    hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out
Modified:
    hive/trunk/contrib/build.xml
    hive/trunk/eclipse-templates/.classpath
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java

Modified: hive/trunk/contrib/build.xml
URL: http://svn.apache.org/viewvc/hive/trunk/contrib/build.xml?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/contrib/build.xml (original)
+++ hive/trunk/contrib/build.xml Sat Jul 16 06:42:00 2011
@@ -76,6 +76,7 @@
               logFile="${test.log.dir}/testcontribparsegen.log"
               logDirectory="${test.log.dir}/contribpositive"/>
     
+ <!-- TestContribParseNegative.java appears to have been removed?
     <qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/ql/parse" 
               templatePath="${ql.test.template.dir}" template="TestParseNegative.vm" 
               queryDirectory="${contrib.test.query.dir}/negative" 
@@ -85,6 +86,7 @@
               resultsDirectory="${contrib.test.results.dir}/compiler/errors" className="TestContribParseNegative"
               logFile="${test.log.dir}/testcontribparseneggen.log"
               logDirectory="${test.log.dir}/contribnegative"/>
+   -->
 
     <qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/cli" 
               templatePath="${ql.test.template.dir}" template="TestCliDriver.vm" 

Modified: hive/trunk/eclipse-templates/.classpath
URL: http://svn.apache.org/viewvc/hive/trunk/eclipse-templates/.classpath?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/eclipse-templates/.classpath (original)
+++ hive/trunk/eclipse-templates/.classpath Sat Jul 16 06:42:00 2011
@@ -10,7 +10,7 @@
   <classpathentry exported="true" kind="lib" path="cli/lib/jline-@jline.version@.jar"/>
   <classpathentry exported="true" kind="lib" path="lib/json.jar"/>
   <classpathentry exported="true" kind="lib" path="lib/asm-@asm.version@.jar"/>
-  <classpathentry exported="true" kind="lib" path="lib/commons-codec-@commons-codec.version@.jar"/>
+  <classpathentry exported="true" kind="lib" path="build/hadoopcore/hadoop-@HADOOPVER@/lib/commons-codec-@commons-codec.version@.jar"/>
   <classpathentry exported="true" kind="lib" path="lib/commons-lang-@commons-lang.version@.jar"/>
   <classpathentry exported="true" kind="lib" path="lib/commons-logging-@commons-logging.version@.jar"/>
   <classpathentry exported="true" kind="lib" path="lib/commons-logging-api-@commons-logging-api.version@.jar"/>

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java?rev=1147364&r1=1147363&r2=1147364&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/BucketMapJoinOptimizer.java Sat Jul 16 06:42:00 2011
@@ -27,9 +27,9 @@ import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.Stack;
-import java.util.Map.Entry;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -143,6 +143,7 @@ public class BucketMapJoinOptimizer impl
     @Override
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         Object... nodeOutputs) throws SemanticException {
+
       MapJoinOperator mapJoinOp = (MapJoinOperator) nd;
       BucketMapjoinOptProcCtx context = (BucketMapjoinOptProcCtx) procCtx;
 
@@ -256,12 +257,12 @@ public class BucketMapJoinOptimizer impl
             Iterator<Partition> iter = prunedParts.getConfirmedPartns()
                 .iterator();
             if (iter.hasNext()) {
-              part = iter.next();              
+              part = iter.next();
             }
             if (part == null) {
               iter = prunedParts.getUnknownPartns().iterator();
               if (iter.hasNext()) {
-                part = iter.next();              
+                part = iter.next();
               }
             }
             assert part != null;
@@ -467,18 +468,13 @@ public class BucketMapJoinOptimizer impl
         }
       }
 
-      // to see if the join columns from a table is exactly this same as its
-      // bucket columns
-      if (joinCols.size() == 0 || joinCols.size() != bucketColumns.size()) {
+      // Check if the join columns contain all bucket columns.
+      // If a table is bucketized on column B but the join key is A and B,
+      // it is easy to see that joining rows from different buckets yields empty results.
+      if (joinCols.size() == 0 || !joinCols.containsAll(bucketColumns)) {
         return false;
       }
 
-      for (String col : joinCols) {
-        if (!bucketColumns.contains(col)) {
-          return false;
-        }
-      }
-
       return true;
     }
 

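To make the comment above the new containsAll() check concrete, here is a hedged, self-contained Java sketch (with an assumed hash-based bucketing function, not Hive's actual bucketing code): two rows that match on the full join key (a, b) necessarily match on the bucket column b, so they always land in the same bucket and a bucket-wise map join loses no matches; conversely, rows from different buckets differ on b and can never satisfy the join condition.

    import java.util.Arrays;
    import java.util.List;

    public class BucketContainmentSketch {
      // Assumed bucketing function: the bucket is derived from the bucket column only.
      static int bucketOf(String bucketColValue, int numBuckets) {
        return (bucketColValue.hashCode() & Integer.MAX_VALUE) % numBuckets;
      }

      public static void main(String[] args) {
        int numBuckets = 2;
        // Each row is {a, b}; the table is bucketized on b, the join key is (a, b).
        List<String[]> left = Arrays.asList(
            new String[]{"a1", "b1"}, new String[]{"a2", "b2"});
        List<String[]> right = Arrays.asList(
            new String[]{"a1", "b1"}, new String[]{"a3", "b2"});
        for (String[] l : left) {
          for (String[] r : right) {
            boolean joinMatch = l[0].equals(r[0]) && l[1].equals(r[1]);
            boolean sameBucket =
                bucketOf(l[1], numBuckets) == bucketOf(r[1], numBuckets);
            if (joinMatch && !sameBucket) {
              throw new AssertionError("a join match crossed buckets");
            }
          }
        }
        System.out.println("every join match stays within matching buckets");
      }
    }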
Added: hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q?rev=1147364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/smb_mapjoin_10.q Sat Jul 16 06:42:00 2011
@@ -0,0 +1,18 @@
+
+create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE; 
+
+alter table tmp_smb_bucket_10 add partition (ds = '1');
+alter table tmp_smb_bucket_10 add partition (ds = '2');
+
+set hive.optimize.bucketmapjoin = true;
+set hive.optimize.bucketmapjoin.sortedmerge = true;
+set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+ 
+explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b 
+on (a.ds = '1' and b.ds = '2' and
+    a.userid = b.userid and
+    a.pageid = b.pageid and
+    a.postid = b.postid and
+    a.type = b.type);
+

Added: hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out?rev=1147364&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_10.q.out Sat Jul 16 06:42:00 2011
@@ -0,0 +1,119 @@
+PREHOOK: query: create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmp_smb_bucket_10
+PREHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '1')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '1')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=1
+PREHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '2')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: query: alter table tmp_smb_bucket_10 add partition (ds = '2')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Input: default@tmp_smb_bucket_10
+POSTHOOK: Output: default@tmp_smb_bucket_10@ds=2
+PREHOOK: query: explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b 
+on (a.ds = '1' and b.ds = '2' and
+    a.userid = b.userid and
+    a.pageid = b.pageid and
+    a.postid = b.postid and
+    a.type = b.type)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b 
+on (a.ds = '1' and b.ds = '2' and
+    a.userid = b.userid and
+    a.pageid = b.pageid and
+    a.postid = b.postid and
+    a.type = b.type)
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME tmp_smb_bucket_10) a) (TOK_TABREF (TOK_TABNAME tmp_smb_bucket_10) b) (and (and (and (and (and (= (. (TOK_TABLE_OR_COL a) ds) '1') (= (. (TOK_TABLE_OR_COL b) ds) '2')) (= (. (TOK_TABLE_OR_COL a) userid) (. (TOK_TABLE_OR_COL b) userid))) (= (. (TOK_TABLE_OR_COL a) pageid) (. (TOK_TABLE_OR_COL b) pageid))) (= (. (TOK_TABLE_OR_COL a) postid) (. (TOK_TABLE_OR_COL b) postid))) (= (. (TOK_TABLE_OR_COL a) type) (. (TOK_TABLE_OR_COL b) type))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        b 
+          TableScan
+            alias: b
+            Sorted Merge Bucket Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {userid} {pageid} {postid} {type} {ds}
+                1 {userid} {pageid} {postid} {type} {ds}
+              handleSkewJoin: false
+              keys:
+                0 [Column[userid], Column[pageid], Column[postid], Column[type]]
+                1 [Column[userid], Column[pageid], Column[postid], Column[type]]
+              outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col7, _col8, _col9, _col10, _col11
+              Position of Big Table: 1
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: int
+                      expr: _col1
+                      type: int
+                      expr: _col2
+                      type: int
+                      expr: _col3
+                      type: string
+                      expr: _col4
+                      type: string
+                      expr: _col7
+                      type: int
+                      expr: _col8
+                      type: int
+                      expr: _col9
+                      type: int
+                      expr: _col10
+                      type: string
+                      expr: _col11
+                      type: string
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col7, _col8, _col9, _col10, _col11
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: int
+                        expr: _col2
+                        type: int
+                        expr: _col3
+                        type: string
+                        expr: _col4
+                        type: string
+                        expr: _col7
+                        type: int
+                        expr: _col8
+                        type: int
+                        expr: _col9
+                        type: int
+                        expr: _col10
+                        type: string
+                        expr: _col11
+                        type: string
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+