You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2010/06/22 03:17:02 UTC
svn commit: r956756 - in /hadoop/hive/branches/branch-0.6: CHANGES.txt
ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
ql/src/test/queries/clientpositive/sample10.q
ql/src/test/results/clientpositive/sample10.q.out
Author: namit
Date: Tue Jun 22 01:17:02 2010
New Revision: 956756
URL: http://svn.apache.org/viewvc?rev=956756&view=rev
Log:
HIVE-1412. Bug in CombineHiveInputFormat with sampling
(Ning Zhang via namit)
Added:
hadoop/hive/branches/branch-0.6/ql/src/test/queries/clientpositive/sample10.q
hadoop/hive/branches/branch-0.6/ql/src/test/results/clientpositive/sample10.q.out
Modified:
hadoop/hive/branches/branch-0.6/CHANGES.txt
hadoop/hive/branches/branch-0.6/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
Modified: hadoop/hive/branches/branch-0.6/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.6/CHANGES.txt?rev=956756&r1=956755&r2=956756&view=diff
==============================================================================
--- hadoop/hive/branches/branch-0.6/CHANGES.txt (original)
+++ hadoop/hive/branches/branch-0.6/CHANGES.txt Tue Jun 22 01:17:02 2010
@@ -529,6 +529,9 @@ Release 0.6.0 - Unreleased
HIVE-1418. Bug in RCfiles with Lateral Views
(He Yongqiang via namit)
+ HIVE-1412. Bug in CombineHiveInputFormat with sampling
+ (Ning Zhang via namit)
+
HIVE-1421. problem with sequence and rcfiles are mixed for null partitions
(namit via He Yongqiang)
Modified: hadoop/hive/branches/branch-0.6/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.6/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java?rev=956756&r1=956755&r2=956756&view=diff
==============================================================================
--- hadoop/hive/branches/branch-0.6/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java (original)
+++ hadoop/hive/branches/branch-0.6/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java Tue Jun 22 01:17:02 2010
@@ -23,9 +23,11 @@ import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
+import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -36,9 +38,9 @@ import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
-import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
import org.apache.hadoop.hive.shims.HadoopShims.InputSplitShim;
+import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
@@ -230,8 +232,8 @@ public class CombineHiveInputFormat<K ex
// combine splits only from same tables and same partitions. Do not combine splits from multiple
// tables or multiple partitions.
Path[] paths = combine.getInputPathsShim(job);
+ Set<Path> poolSet = new HashSet<Path>();
for (Path path : paths) {
- LOG.info("CombineHiveInputSplit creating pool for " + path);
PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, path);
TableDesc tableDesc = part.getTableDesc();
@@ -283,7 +285,24 @@ public class CombineHiveInputFormat<K ex
return super.getSplits(job, numSplits);
}
- combine.createPool(job, new CombineFilter(path));
+ // In the case of tablesample, the input paths are pointing to files rather than directories.
+ // We need to get the parent directory as the filtering path so that all files in the same
+ // parent directory will be grouped into one pool but not files from different parent
+ // directories. This guarantees that a split will combine all files in the same partition
+ // but won't cross multiple partitions.
+ Path filterPath = path;
+ if (!path.getFileSystem(job).getFileStatus(path).isDir()) { // path is not directory
+ filterPath = path.getParent();
+ }
+ if (!poolSet.contains(filterPath)) {
+ LOG.info("CombineHiveInputSplit creating pool for " + path +
+ "; using filter path " + filterPath);
+ combine.createPool(job, new CombineFilter(filterPath));
+ poolSet.add(filterPath);
+ } else {
+ LOG.info("CombineHiveInputSplit: pool is already created for " + path +
+ "; using filter path " + filterPath);
+ }
}
InputSplitShim[] iss = combine.getSplits(job, 1);
for (InputSplitShim is : iss) {
@@ -389,10 +408,12 @@ public class CombineHiveInputFormat<K ex
private final String pString;
// store a path prefix in this TestFilter
+ // PRECONDITION: p should always be a directory
public CombineFilter(Path p) {
// we need to keep the path part only because the Hadoop CombineFileInputFormat will
// pass the path part only to accept().
- pString = p.toUri().getPath().toString() + File.separator;
+ // Append a trailing separator to the path to prevent partial (prefix) matching.
+ pString = p.toUri().getPath().toString() + File.separator;
}
// returns true if the specified path matches the prefix stored
Added: hadoop/hive/branches/branch-0.6/ql/src/test/queries/clientpositive/sample10.q
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.6/ql/src/test/queries/clientpositive/sample10.q?rev=956756&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.6/ql/src/test/queries/clientpositive/sample10.q (added)
+++ hadoop/hive/branches/branch-0.6/ql/src/test/queries/clientpositive/sample10.q Tue Jun 22 01:17:02 2010
@@ -0,0 +1,24 @@
+
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.enforce.bucketing=true;
+set hive.exec.reducers.max=4;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set hive.default.fileformat=RCFILE;
+
+-- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19)
+
+create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets;
+
+insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10;
+
+explain extended
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds;
+
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds;
+
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds;
+
+select * from srcpartbucket where ds is not null;
+
+drop table srcpartbucket;
Added: hadoop/hive/branches/branch-0.6/ql/src/test/results/clientpositive/sample10.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/branches/branch-0.6/ql/src/test/results/clientpositive/sample10.q.out?rev=956756&view=auto
==============================================================================
--- hadoop/hive/branches/branch-0.6/ql/src/test/results/clientpositive/sample10.q.out (added)
+++ hadoop/hive/branches/branch-0.6/ql/src/test/results/clientpositive/sample10.q.out Tue Jun 22 01:17:02 2010
@@ -0,0 +1,441 @@
+PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19)
+
+create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS(0.17, 0.18, 0.19)
+
+create table srcpartbucket (key string, value string) partitioned by (ds string, hr string) clustered by (key) into 4 buckets
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@srcpartbucket
+PREHOOK: query: insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: query: insert overwrite table srcpartbucket partition(ds, hr) select * from srcpart where ds is not null and key < 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpartbucket@ds=2008-04-08/hr=11
+POSTHOOK: Output: default@srcpartbucket@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@srcpartbucket@ds=2008-04-09/hr=11
+POSTHOOK: Output: default@srcpartbucket@ds=2008-04-09/hr=12
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+PREHOOK: query: explain extended
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended
+select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF srcpartbucket (TOK_TABLESAMPLE 1 4 (TOK_TABLE_OR_COL key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL ds)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_WHERE (TOK_FUNCTION TOK_ISNOTNULL (TOK_TABLE_OR_COL ds))) (TOK_GROUPBY (TOK_TABLE_OR_COL ds))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ srcpartbucket
+ TableScan
+ alias: srcpartbucket
+ Filter Operator
+ isSamplingPred: false
+ predicate:
+ expr: ((((hash(key) & 2147483647) % 4) = 0) and ds is not null)
+ type: boolean
+ Filter Operator
+ isSamplingPred: true
+ predicate:
+ expr: (((hash(key) & 2147483647) % 4) = 0)
+ type: boolean
+ Filter Operator
+ isSamplingPred: false
+ predicate:
+ expr: ds is not null
+ type: boolean
+ Select Operator
+ expressions:
+ expr: ds
+ type: string
+ outputColumnNames: ds
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: ds
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Needs Tagging: false
+ Path -> Alias:
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=11/attempt_local_0001_r_000000_0 [srcpartbucket]
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=12/attempt_local_0001_r_000000_0 [srcpartbucket]
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=11/attempt_local_0001_r_000000_0 [srcpartbucket]
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=12/attempt_local_0001_r_000000_0 [srcpartbucket]
+ Path -> Partition:
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=11/attempt_local_0001_r_000000_0
+ Partition
+ base file name: attempt_local_0001_r_000000_0
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ partition values:
+ ds 2008-04-08
+ hr 11
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: srcpartbucket
+ name: srcpartbucket
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-08/hr=12/attempt_local_0001_r_000000_0
+ Partition
+ base file name: attempt_local_0001_r_000000_0
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ partition values:
+ ds 2008-04-08
+ hr 12
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: srcpartbucket
+ name: srcpartbucket
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=11/attempt_local_0001_r_000000_0
+ Partition
+ base file name: attempt_local_0001_r_000000_0
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ partition values:
+ ds 2008-04-09
+ hr 11
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: srcpartbucket
+ name: srcpartbucket
+ file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket/ds=2008-04-09/hr=12/attempt_local_0001_r_000000_0
+ Partition
+ base file name: attempt_local_0001_r_000000_0
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ partition values:
+ ds 2008-04-09
+ hr 12
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ properties:
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.types string:string
+ file.inputformat org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ file.outputformat org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ location file:/data/users/nzhang/work/999/apache-hive/build/ql/test/data/warehouse/srcpartbucket
+ name srcpartbucket
+ partition_columns ds/hr
+ serialization.ddl struct srcpartbucket { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ transient_lastDdlTime 1277145923
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: srcpartbucket
+ name: srcpartbucket
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ directory: file:/data/users/nzhang/work/999/apache-hive/build/ql/scratchdir/hive_2010-06-21_11-45-29_813_4438091765019255104/10001
+ NumFilesPerFileSink: 1
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ serialization.format 1
+ TotalFiles: 1
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-30_185_5515955290012905688/10000
+POSTHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 4 on key) where ds is not null group by ds
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-30_185_5515955290012905688/10000
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+2008-04-08 10
+2008-04-09 10
+PREHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-35_252_5650802559039809329/10000
+POSTHOOK: query: select ds, count(1) from srcpartbucket tablesample (bucket 1 out of 2 on key) where ds is not null group by ds
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-35_252_5650802559039809329/10000
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+2008-04-08 12
+2008-04-09 12
+PREHOOK: query: select * from srcpartbucket where ds is not null
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+PREHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-40_176_6924299231993417982/10000
+POSTHOOK: query: select * from srcpartbucket where ds is not null
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-08/hr=12
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpartbucket@ds=2008-04-09/hr=12
+POSTHOOK: Output: file:/tmp/nzhang/hive_2010-06-21_11-45-40_176_6924299231993417982/10000
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+0 val_0 2008-04-08 11
+4 val_4 2008-04-08 11
+8 val_8 2008-04-08 11
+0 val_0 2008-04-08 11
+0 val_0 2008-04-08 11
+5 val_5 2008-04-08 11
+5 val_5 2008-04-08 11
+2 val_2 2008-04-08 11
+5 val_5 2008-04-08 11
+9 val_9 2008-04-08 11
+0 val_0 2008-04-08 12
+4 val_4 2008-04-08 12
+8 val_8 2008-04-08 12
+0 val_0 2008-04-08 12
+0 val_0 2008-04-08 12
+5 val_5 2008-04-08 12
+5 val_5 2008-04-08 12
+2 val_2 2008-04-08 12
+5 val_5 2008-04-08 12
+9 val_9 2008-04-08 12
+0 val_0 2008-04-09 11
+4 val_4 2008-04-09 11
+8 val_8 2008-04-09 11
+0 val_0 2008-04-09 11
+0 val_0 2008-04-09 11
+5 val_5 2008-04-09 11
+5 val_5 2008-04-09 11
+2 val_2 2008-04-09 11
+5 val_5 2008-04-09 11
+9 val_9 2008-04-09 11
+0 val_0 2008-04-09 12
+4 val_4 2008-04-09 12
+8 val_8 2008-04-09 12
+0 val_0 2008-04-09 12
+0 val_0 2008-04-09 12
+5 val_5 2008-04-09 12
+5 val_5 2008-04-09 12
+2 val_2 2008-04-09 12
+5 val_5 2008-04-09 12
+9 val_9 2008-04-09 12
+PREHOOK: query: drop table srcpartbucket
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table srcpartbucket
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Output: default@srcpartbucket
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).key SIMPLE [(srcpart)srcpart.FieldSchema(name:ds, type:string, comment:null), ]
+POSTHOOK: Lineage: srcpartbucket PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:hr, type:string, comment:null), ]