You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2012/12/23 04:32:20 UTC

svn commit: r1425398 - in /hive/trunk: build-common.xml ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/test/queries/clientpositive/list_bucket_dml_10.q ql/src/test/results/clientpositive/list_bucket_dml_10.q.out

Author: namit
Date: Sun Dec 23 03:32:19 2012
New Revision: 1425398

URL: http://svn.apache.org/viewvc?rev=1425398&view=rev
Log:
HIVE-3828 insert overwrite fails with stored-as-dir in cluster
(Gang Tim Liu via namit)


Added:
    hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q
    hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
Modified:
    hive/trunk/build-common.xml
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java

Modified: hive/trunk/build-common.xml
URL: http://svn.apache.org/viewvc/hive/trunk/build-common.xml?rev=1425398&r1=1425397&r2=1425398&view=diff
==============================================================================
--- hive/trunk/build-common.xml (original)
+++ hive/trunk/build-common.xml Sun Dec 23 03:32:19 2012
@@ -57,7 +57,7 @@
   <property name="test.output" value="true"/>
   <property name="test.junit.output.format" value="xml"/>
   <property name="test.junit.output.usefile" value="true"/>
-  <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q"/>
+  <property name="minimr.query.files" value="list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q"/>
   <property name="minimr.query.negative.files" value="cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q" />
   <property name="test.silent" value="true"/>
   <property name="hadoopVersion" value="${hadoop.version.ant-internal}"/>

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java?rev=1425398&r1=1425397&r2=1425398&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java Sun Dec 23 03:32:19 2012
@@ -189,7 +189,8 @@ public class FileSinkOperator extends Te
     private void commit(FileSystem fs) throws HiveException {
       for (int idx = 0; idx < outPaths.length; ++idx) {
         try {
-          if (bDynParts && !fs.exists(finalPaths[idx].getParent())) {
+          if ((bDynParts || isSkewedStoredAsSubDirectories)
+              && !fs.exists(finalPaths[idx].getParent())) {
             fs.mkdirs(finalPaths[idx].getParent());
           }
           if (!fs.rename(outPaths[idx], finalPaths[idx])) {

Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q?rev=1425398&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q Sun Dec 23 03:32:19 2012
@@ -0,0 +1,35 @@
+set hive.mapred.supports.subdirectories=true;
+ 
+set mapred.input.dir.recursive=true;
+
+-- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+--  5263 000000_0
+--  5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String) 
+    partitioned by (ds String, hr String) 
+    skewed by (key) on ('484','51','103')
+    stored as DIRECTORIES
+    STORED AS RCFILE;
+
+-- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08',  hr = '11')
+select key, value from src;
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src;
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');	

Added: hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out?rev=1425398&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out Sun Dec 23 03:32:19 2012
@@ -0,0 +1,345 @@
+PREHOOK: query: -- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+--  5263 000000_0
+--  5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String) 
+    partitioned by (ds String, hr String) 
+    skewed by (key) on ('484','51','103')
+    stored as DIRECTORIES
+    STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+--  5263 000000_0
+--  5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String) 
+    partitioned by (ds String, hr String) 
+    skewed by (key) on ('484','51','103')
+    stored as DIRECTORIES
+    STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@list_bucketing_static_part
+PREHOOK: query: -- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08',  hr = '11')
+select key, value from src
+PREHOOK: type: QUERY
+POSTHOOK: query: -- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08',  hr = '11')
+select key, value from src
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME list_bucketing_static_part) (TOK_PARTSPEC (TOK_PARTVAL ds '2008-04-08') (TOK_PARTVAL hr '11')))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5
+  Stage-4
+  Stage-0 depends on stages: Stage-4, Stage-3, Stage-6
+  Stage-2 depends on stages: Stage-0
+  Stage-3
+  Stage-5
+  Stage-6 depends on stages: Stage-5
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src 
+          TableScan
+            alias: src
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 1
+#### A masked pattern was here ####
+                NumFilesPerFileSink: 1
+                Static Partition Specification: ds=2008-04-08/hr=11/
+#### A masked pattern was here ####
+                table:
+                    input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+                    properties:
+                      bucket_count -1
+                      columns key,value
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.list_bucketing_static_part
+                      serialization.ddl struct list_bucketing_static_part { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+                    name: default.list_bucketing_static_part
+                TotalFiles: 1
+                GatherStats: true
+                MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numPartitions 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numPartitions 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Truncated Path -> Alias:
+        /src [src]
+
+  Stage: Stage-7
+    Conditional Operator
+
+  Stage: Stage-4
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          partition:
+            ds 2008-04-08
+            hr 11
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.list_bucketing_static_part
+                serialization.ddl struct list_bucketing_static_part { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: default.list_bucketing_static_part
+#### A masked pattern was here ####
+
+  Stage: Stage-2
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+  Stage: Stage-3
+    Block level merge
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.list_bucketing_static_part
+              serialization.ddl struct list_bucketing_static_part { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+          
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.list_bucketing_static_part
+                serialization.ddl struct list_bucketing_static_part { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: default.list_bucketing_static_part
+            name: default.list_bucketing_static_part
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-5
+    Block level merge
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.list_bucketing_static_part
+              serialization.ddl struct list_bucketing_static_part { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+          
+              input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.list_bucketing_static_part
+                serialization.ddl struct list_bucketing_static_part { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+              name: default.list_bucketing_static_part
+            name: default.list_bucketing_static_part
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-6
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+
+PREHOOK: query: insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@list_bucketing_static_part@ds=2008-04-08/hr=11
+POSTHOOK: query: insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@list_bucketing_static_part@ds=2008-04-08/hr=11
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- check DML result
+show partitions list_bucketing_static_part
+PREHOOK: type: SHOWPARTITIONS
+POSTHOOK: query: -- check DML result
+show partitions list_bucketing_static_part
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ds=2008-04-08/hr=11
+PREHOOK: query: desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11')
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name            	data_type           	comment             
+	 	 
+key                 	string              	None                
+value               	string              	None                
+	 	 
+# Partition Information	 	 
+# col_name            	data_type           	comment             
+	 	 
+ds                  	string              	None                
+hr                  	string              	None                
+	 	 
+# Detailed Partition Information	 	 
+Partition Value:    	[2008-04-08, 11]    	 
+Database:           	default             	 
+Table:              	list_bucketing_static_part	 
+#### A masked pattern was here ####
+Protect Mode:       	None                	 
+#### A masked pattern was here ####
+Partition Parameters:	 	 
+	numFiles            	4                   
+	numRows             	0                   
+	rawDataSize         	0                   
+	totalSize           	5520                
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe	 
+InputFormat:        	org.apache.hadoop.hive.ql.io.RCFileInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.RCFileOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+Stored As SubDirectories:	Yes                 	 
+Skewed Columns:     	[key]               	 
+Skewed Values:      	[[484], [51], [103]]	 
+#### A masked pattern was here ####
+Skewed Value to Truncated Path:	{[484]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=484, [103]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=103, [51]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=51}	 
+Storage Desc Params:	 	 
+	serialization.format	1