You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2012/12/23 04:32:20 UTC
svn commit: r1425398 - in /hive/trunk: build-common.xml
ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
ql/src/test/queries/clientpositive/list_bucket_dml_10.q
ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
Author: namit
Date: Sun Dec 23 03:32:19 2012
New Revision: 1425398
URL: http://svn.apache.org/viewvc?rev=1425398&view=rev
Log:
HIVE-3828 insert overwrite fails with stored-as-dir in cluster
(Gang Tim Liu via namit)
Added:
hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q
hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
Modified:
hive/trunk/build-common.xml
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
Modified: hive/trunk/build-common.xml
URL: http://svn.apache.org/viewvc/hive/trunk/build-common.xml?rev=1425398&r1=1425397&r2=1425398&view=diff
==============================================================================
--- hive/trunk/build-common.xml (original)
+++ hive/trunk/build-common.xml Sun Dec 23 03:32:19 2012
@@ -57,7 +57,7 @@
<property name="test.output" value="true"/>
<property name="test.junit.output.format" value="xml"/>
<property name="test.junit.output.usefile" value="true"/>
- <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q"/>
+ <property name="minimr.query.files" value="list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q"/>
<property name="minimr.query.negative.files" value="cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q" />
<property name="test.silent" value="true"/>
<property name="hadoopVersion" value="${hadoop.version.ant-internal}"/>
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java?rev=1425398&r1=1425397&r2=1425398&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java Sun Dec 23 03:32:19 2012
@@ -189,7 +189,8 @@ public class FileSinkOperator extends Te
private void commit(FileSystem fs) throws HiveException {
for (int idx = 0; idx < outPaths.length; ++idx) {
try {
- if (bDynParts && !fs.exists(finalPaths[idx].getParent())) {
+ if ((bDynParts || isSkewedStoredAsSubDirectories)
+ && !fs.exists(finalPaths[idx].getParent())) {
fs.mkdirs(finalPaths[idx].getParent());
}
if (!fs.rename(outPaths[idx], finalPaths[idx])) {
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q?rev=1425398&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_10.q Sun Dec 23 03:32:19 2012
@@ -0,0 +1,35 @@
+set hive.mapred.supports.subdirectories=true;
+
+set mapred.input.dir.recursive=true;
+
+-- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key) on ('484','51','103')
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src;
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src;
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
Added: hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out?rev=1425398&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out Sun Dec 23 03:32:19 2012
@@ -0,0 +1,345 @@
+PREHOOK: query: -- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key) on ('484','51','103')
+ stored as DIRECTORIES
+ STORED AS RCFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- run this test case in minimr to ensure it works in cluster
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key) on ('484','51','103')
+ stored as DIRECTORIES
+ STORED AS RCFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@list_bucketing_static_part
+PREHOOK: query: -- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+PREHOOK: type: QUERY
+POSTHOOK: query: -- list bucketing DML without merge. use bucketize to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME list_bucketing_static_part) (TOK_PARTSPEC (TOK_PARTVAL ds '2008-04-08') (TOK_PARTVAL hr '11')))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5
+ Stage-4
+ Stage-0 depends on stages: Stage-4, Stage-3, Stage-6
+ Stage-2 depends on stages: Stage-0
+ Stage-3
+ Stage-5
+ Stage-6 depends on stages: Stage-5
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ src
+ TableScan
+ alias: src
+ GatherStats: false
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: value
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 1
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Static Partition Specification: ds=2008-04-08/hr=11/
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.list_bucketing_static_part
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: src
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.src
+ name: default.src
+ Truncated Path -> Alias:
+ /src [src]
+
+ Stage: Stage-7
+ Conditional Operator
+
+ Stage: Stage-4
+ Move Operator
+ files:
+ hdfs directory: true
+#### A masked pattern was here ####
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ partition:
+ ds 2008-04-08
+ hr 11
+ replace: true
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.list_bucketing_static_part
+#### A masked pattern was here ####
+
+ Stage: Stage-2
+ Stats-Aggr Operator
+#### A masked pattern was here ####
+
+ Stage: Stage-3
+ Block level merge
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.list_bucketing_static_part
+ name: default.list_bucketing_static_part
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-5
+ Block level merge
+ Needs Tagging: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count -1
+ columns key,value
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.list_bucketing_static_part
+ serialization.ddl struct list_bucketing_static_part { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.list_bucketing_static_part
+ name: default.list_bucketing_static_part
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+ Stage: Stage-6
+ Move Operator
+ files:
+ hdfs directory: true
+#### A masked pattern was here ####
+
+
+PREHOOK: query: insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@list_bucketing_static_part@ds=2008-04-08/hr=11
+POSTHOOK: query: insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@list_bucketing_static_part@ds=2008-04-08/hr=11
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- check DML result
+show partitions list_bucketing_static_part
+PREHOOK: type: SHOWPARTITIONS
+POSTHOOK: query: -- check DML result
+show partitions list_bucketing_static_part
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ds=2008-04-08/hr=11
+PREHOOK: query: desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11')
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11')
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: list_bucketing_static_part PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name data_type comment
+
+key string None
+value string None
+
+# Partition Information
+# col_name data_type comment
+
+ds string None
+hr string None
+
+# Detailed Partition Information
+Partition Value: [2008-04-08, 11]
+Database: default
+Table: list_bucketing_static_part
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+ numFiles 4
+ numRows 0
+ rawDataSize 0
+ totalSize 5520
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Stored As SubDirectories: Yes
+Skewed Columns: [key]
+Skewed Values: [[484], [51], [103]]
+#### A masked pattern was here ####
+Skewed Value to Truncated Path: {[484]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=484, [103]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=103, [51]=/list_bucketing_static_part/ds=2008-04-08/hr=11/key=51}
+Storage Desc Params:
+ serialization.format 1