You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2012/09/05 17:01:35 UTC

svn commit: r1381213 - in /hive/trunk: build-common.xml ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java ql/src/test/queries/clientpositive/bucketmapjoin7.q ql/src/test/results/clientpositive/bucketmapjoin7.q.out

Author: namit
Date: Wed Sep  5 15:01:34 2012
New Revision: 1381213

URL: http://svn.apache.org/viewvc?rev=1381213&view=rev
Log:
HIVE-3429 Bucket map join involving table with more than 1 partition column causes 
FileNotFoundException (Kevin Wilfong via namit)


Added:
    hive/trunk/ql/src/test/queries/clientpositive/bucketmapjoin7.q
    hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out
Modified:
    hive/trunk/build-common.xml
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java

Modified: hive/trunk/build-common.xml
URL: http://svn.apache.org/viewvc/hive/trunk/build-common.xml?rev=1381213&r1=1381212&r2=1381213&view=diff
==============================================================================
--- hive/trunk/build-common.xml (original)
+++ hive/trunk/build-common.xml Wed Sep  5 15:01:34 2012
@@ -57,7 +57,7 @@
   <property name="test.output" value="true"/>
   <property name="test.junit.output.format" value="xml"/>
   <property name="test.junit.output.usefile" value="true"/>
-  <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q"/>
+  <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q"/>
   <property name="minimr.query.negative.files" value="cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q" />
   <property name="test.silent" value="true"/>
   <property name="hadoopVersion" value="${hadoop.version.ant-internal}"/>

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java?rev=1381213&r1=1381212&r2=1381213&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/BucketMapJoinContext.java Wed Sep  5 15:01:34 2012
@@ -17,9 +17,6 @@
  */
 package org.apache.hadoop.hive.ql.plan;
 
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.ql.exec.BucketMatcher;
-
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -28,6 +25,10 @@ import java.util.List;
 import java.util.Map;
 import java.util.regex.Pattern;
 
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.ql.exec.BucketMatcher;
+
 /**
  * was inner class of MapreLocalWork. context for bucket mapjoin (or smb join)
  */
@@ -199,7 +200,8 @@ public class BucketMapJoinContext implem
     Map<String, String> mapping = inputToPartSpecMapping == null ?
         inputToPartSpecMapping = revert(bigTablePartSpecToFileMapping) : inputToPartSpecMapping;
     String partSpec = mapping.get(inputPath);
-    return partSpec == null || partSpec.isEmpty() ? fileName : "(" + partSpec + ")" + fileName;
+    return partSpec == null || partSpec.isEmpty() ? fileName :
+      "(" + FileUtils.escapePathName(partSpec) + ")" + fileName;
   }
 
   // revert partSpecToFileMapping to inputToPartSpecMapping

Added: hive/trunk/ql/src/test/queries/clientpositive/bucketmapjoin7.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/bucketmapjoin7.q?rev=1381213&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/bucketmapjoin7.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/bucketmapjoin7.q Wed Sep  5 15:01:34 2012
@@ -0,0 +1,24 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+CREATE TABLE srcbucket_mapjoin_part_1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0');
+LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0');
+
+CREATE TABLE srcbucket_mapjoin_part_2 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0');
+LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0');
+
+set hive.optimize.bucketmapjoin=true;
+
+-- Tests that bucket map join works with a table with more than one level of partitioning
+
+EXPLAIN EXTENDED
+SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1;
+
+SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1;

Added: hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out?rev=1381213&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out Wed Sep  5 15:01:34 2012
@@ -0,0 +1,218 @@
+PREHOOK: query: CREATE TABLE srcbucket_mapjoin_part_1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE srcbucket_mapjoin_part_1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@srcbucket_mapjoin_part_1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@srcbucket_mapjoin_part_1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@srcbucket_mapjoin_part_1
+POSTHOOK: Output: default@srcbucket_mapjoin_part_1@ds=2008-04-08/hr=0
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@srcbucket_mapjoin_part_1@ds=2008-04-08/hr=0
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_1 PARTITION (ds='2008-04-08', hr='0')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@srcbucket_mapjoin_part_1@ds=2008-04-08/hr=0
+PREHOOK: query: CREATE TABLE srcbucket_mapjoin_part_2 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE srcbucket_mapjoin_part_2 (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING) 
+CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@srcbucket_mapjoin_part_2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@srcbucket_mapjoin_part_2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket20.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@srcbucket_mapjoin_part_2
+POSTHOOK: Output: default@srcbucket_mapjoin_part_2@ds=2008-04-08/hr=0
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@srcbucket_mapjoin_part_2@ds=2008-04-08/hr=0
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/srcbucket21.txt' INTO TABLE srcbucket_mapjoin_part_2 PARTITION (ds='2008-04-08', hr='0')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@srcbucket_mapjoin_part_2@ds=2008-04-08/hr=0
+PREHOOK: query: -- Tests that bucket map join works with a table with more than one level of partitioning
+
+EXPLAIN EXTENDED
+SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Tests that bucket map join works with a table with more than one level of partitioning
+
+EXPLAIN EXTENDED
+SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME srcbucket_mapjoin_part_1) a) (TOK_TABREF (TOK_TABNAME srcbucket_mapjoin_part_2) b) (AND (AND (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) ds) '2008-04-08')) (= (. (TOK_TABLE_OR_COL b) ds) '2008-04-08')))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST b))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value))) (TOK_LIMIT 1)))
+
+STAGE DEPENDENCIES:
+  Stage-3 is a root stage
+  Stage-1 depends on stages: Stage-3
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-3
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        b 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        b 
+          TableScan
+            alias: b
+            GatherStats: false
+            HashTable Sink Operator
+              condition expressions:
+                0 {key}
+                1 {value}
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              Position of Big Table: 0
+      Bucket Mapjoin Context:
+          Alias Bucket Base File Name Mapping:
+            b {ds=2008-04-08/hr=0/srcbucket20.txt=[ds=2008-04-08/hr=0/srcbucket20.txt], ds=2008-04-08/hr=0/srcbucket21.txt=[ds=2008-04-08/hr=0/srcbucket21.txt]}
+          Alias Bucket File Name Mapping:
+#### A masked pattern was here ####
+          Alias Bucket Output File Name Mapping:
+#### A masked pattern was here ####
+
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        a 
+          TableScan
+            alias: a
+            GatherStats: false
+            Map Join Operator
+              condition map:
+                   Inner Join 0 to 1
+              condition expressions:
+                0 {key}
+                1 {value}
+              handleSkewJoin: false
+              keys:
+                0 [Column[key]]
+                1 [Column[key]]
+              outputColumnNames: _col0, _col7
+              Position of Big Table: 0
+              Select Operator
+                expressions:
+                      expr: _col0
+                      type: int
+                      expr: _col7
+                      type: string
+                outputColumnNames: _col0, _col7
+                Select Operator
+                  expressions:
+                        expr: _col0
+                        type: int
+                        expr: _col7
+                        type: string
+                  outputColumnNames: _col0, _col1
+                  Limit
+                    File Output Operator
+                      compressed: false
+                      GlobalTableId: 0
+#### A masked pattern was here ####
+                      NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          properties:
+                            columns _col0,_col1
+                            columns.types int:string
+                            escape.delim \
+                            serialization.format 1
+                      TotalFiles: 1
+                      GatherStats: false
+                      MultiFileSpray: false
+      Local Work:
+        Map Reduce Local Work
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: hr=0
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            partition values:
+              ds 2008-04-08
+              hr 0
+            properties:
+              bucket_count 2
+              bucket_field_name key
+              columns key,value
+              columns.types int:string
+#### A masked pattern was here ####
+              name default.srcbucket_mapjoin_part_1
+              numFiles 2
+              numPartitions 1
+              numRows 0
+              partition_columns ds/hr
+              rawDataSize 0
+              serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 2750
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count 2
+                bucket_field_name key
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.srcbucket_mapjoin_part_1
+                numFiles 2
+                numPartitions 1
+                numRows 0
+                partition_columns ds/hr
+                rawDataSize 0
+                serialization.ddl struct srcbucket_mapjoin_part_1 { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 2750
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.srcbucket_mapjoin_part_1
+            name: default.srcbucket_mapjoin_part_1
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+
+
+PREHOOK: query: SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcbucket_mapjoin_part_1@ds=2008-04-08/hr=0
+PREHOOK: Input: default@srcbucket_mapjoin_part_2@ds=2008-04-08/hr=0
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT /*+ MAPJOIN(b) */ a.key, b.value
+FROM srcbucket_mapjoin_part_1 a JOIN srcbucket_mapjoin_part_2 b
+ON a.key = b.key AND a.ds = '2008-04-08' AND b.ds = '2008-04-08' LIMIT 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcbucket_mapjoin_part_1@ds=2008-04-08/hr=0
+POSTHOOK: Input: default@srcbucket_mapjoin_part_2@ds=2008-04-08/hr=0
+#### A masked pattern was here ####
+27	val_27