Posted to commits@hive.apache.org by na...@apache.org on 2012/12/19 08:58:42 UTC

svn commit: r1423777 [1/4] - in /hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org/apache/hadoop/hive/ql/parse/ ql/src/java/org/apache/hadoop/hive/ql/plan/ ql/src/test/queries/clientpositive/ ql/src/test/results/clientposi...

Author: namit
Date: Wed Dec 19 07:58:40 2012
New Revision: 1423777

URL: http://svn.apache.org/viewvc?rev=1423777&view=rev
Log:
HIVE-3796 Multi-insert involving bucketed/sorted table turns off merging on all outputs
(Kevin Wilfong via namit)
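
The root cause, visible in the SemanticAnalyzer diff below: merging was disabled by flipping session-wide HiveConf flags, so a single bucketed/sorted output in a multi-insert switched merging off for every output. A toy, self-contained Java illustration of that shared-flag pitfall (hypothetical names, not Hive code):

    public class SharedFlagPitfall {
      // Hypothetical: one query-wide flag shared by all outputs (the old behavior).
      static boolean mergeEnabledGlobally = true;

      public static void main(String[] args) {
        boolean[] sinkIsBucketed = {true, false};  // two outputs of one multi-insert
        for (boolean bucketed : sinkIsBucketed) {
          if (bucketed) {
            mergeEnabledGlobally = false;  // one bucketed sink vetoes everyone
          }
        }
        // Both sinks now see merging disabled, including the unbucketed one.
        System.out.println("merge enabled: " + mergeEnabledGlobally);
      }
    }

The fix replaces the global switch with a per-FileSink canBeMerged flag, as the diffs below show.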


Added:
    hive/trunk/ql/src/test/queries/clientpositive/bucket5.q
    hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out
Modified:
    hive/trunk/build-common.xml
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_3.q.out
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_5.q.out
    hive/trunk/ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
    hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin9.q.out
    hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_6.q.out
    hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_7.q.out

Modified: hive/trunk/build-common.xml
URL: http://svn.apache.org/viewvc/hive/trunk/build-common.xml?rev=1423777&r1=1423776&r2=1423777&view=diff
==============================================================================
--- hive/trunk/build-common.xml (original)
+++ hive/trunk/build-common.xml Wed Dec 19 07:58:40 2012
@@ -57,7 +57,7 @@
   <property name="test.output" value="true"/>
   <property name="test.junit.output.format" value="xml"/>
   <property name="test.junit.output.usefile" value="true"/>
-  <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q"/>
+  <property name="minimr.query.files" value="input16_cc.q,scriptfile1.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q"/>
   <property name="minimr.query.negative.files" value="cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q" />
   <property name="test.silent" value="true"/>
   <property name="hadoopVersion" value="${hadoop.version.ant-internal}"/>

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java?rev=1423777&r1=1423776&r2=1423777&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java Wed Dec 19 07:58:40 2012
@@ -137,7 +137,7 @@ public class GenMRFileSink1 implements N
           addStatsTask(fsOp, mvTask, currTask, parseCtx.getConf());
         }
 
-        if ((mvTask != null) && !mvTask.isLocal()) {
+        if ((mvTask != null) && !mvTask.isLocal() && fsOp.getConf().canBeMerged()) {
           if (fsOp.getConf().isLinkedFileSink()) {
            // If the user has HIVEMERGEMAPREDFILES set to false, the idea was that the
            // number of reducers is small, so the number of files is small anyway.

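For context, the guard added above makes merging a per-output decision: each FileSinkOperator now carries its own flag instead of consulting a query-wide setting. A toy, runnable sketch of the effect (plain Java with hypothetical names; Sink stands in for FileSinkDesc and is not the real Hive API):

    import java.util.List;

    public class MergeFlagDemo {
      // Hypothetical stand-in for FileSinkDesc: only the merge flag matters here.
      record Sink(String name, boolean canBeMerged) {}

      public static void main(String[] args) {
        // One multi-insert, two outputs: a bucketed table (merge off) and an
        // unbucketed one (merge on), mirroring the bucket5.q test added below.
        List<Sink> sinks = List.of(
            new Sink("bucketed_table", false),
            new Sink("unbucketed_table", true));
        for (Sink s : sinks) {
          // Each sink is judged on its own, as in the guarded branch above.
          System.out.println(s.name() + ": merge " + (s.canBeMerged() ? "enabled" : "disabled"));
        }
      }
    }
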
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1423777&r1=1423776&r2=1423777&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Wed Dec 19 07:58:40 2012
@@ -4493,9 +4493,6 @@ public class SemanticAnalyzer extends Ba
       ctx.setNumFiles(numFiles);
       ctx.setPartnCols(partnColsNoConvert);
       ctx.setTotalFiles(totalFiles);
-      //disable "merge mapfiles" and "merge mapred files".
-      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVEMERGEMAPFILES, false);
-      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVEMERGEMAPREDFILES, false);
     }
     return input;
   }
@@ -4877,12 +4874,21 @@ public class SemanticAnalyzer extends Ba
 
     RowSchema fsRS = new RowSchema(vecCol);
 
+    // The output files of a FileSink can be merged if they are either not being written to a table,
+    // or are being written to a table that is either not bucketed or has enforce bucketing unset,
+    // and is either not sorted or has enforce sorting unset.
+    boolean canBeMerged = (dest_tab == null || !((dest_tab.getNumBuckets() > 0 &&
+        conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCEBUCKETING)) ||
+        (dest_tab.getSortCols() != null && dest_tab.getSortCols().size() > 0 &&
+        conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCESORTING))));
+
     FileSinkDesc fileSinkDesc = new FileSinkDesc(
       queryTmpdir,
       table_desc,
       conf.getBoolVar(HiveConf.ConfVars.COMPRESSRESULT),
       currentTableId,
       rsCtx.isMultiFileSpray(),
+      canBeMerged,
       rsCtx.getNumFiles(),
       rsCtx.getTotalFiles(),
       rsCtx.getPartnCols(),

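The double-negated expression above is easy to misread. Stated positively: a sink's output can be merged unless the destination table is bucketed while hive.enforce.bucketing is set, or has sort columns while hive.enforce.sorting is set. A simplified, runnable restatement (hypothetical helper; plain booleans and ints stand in for the HiveConf and Table lookups):

    public class CanBeMergedDemo {
      // Hypothetical restatement of the canBeMerged computation above.
      static boolean canBeMerged(boolean hasDestTable, int numBuckets,
          int numSortCols, boolean enforceBucketing, boolean enforceSorting) {
        if (!hasDestTable) {
          return true;  // not writing to a table: output is always mergeable
        }
        boolean bucketingApplies = numBuckets > 0 && enforceBucketing;
        boolean sortingApplies = numSortCols > 0 && enforceSorting;
        return !bucketingApplies && !sortingApplies;
      }

      public static void main(String[] args) {
        // bucketed_table from bucket5.q: 2 buckets, 1 sort column, both enforced
        System.out.println(canBeMerged(true, 2, 1, true, true));  // false
        // unbucketed_table: no buckets, no sort columns
        System.out.println(canBeMerged(true, 0, 0, true, true));  // true
      }
    }
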
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java?rev=1423777&r1=1423776&r2=1423777&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java Wed Dec 19 07:58:40 2012
@@ -40,6 +40,9 @@ public class FileSinkDesc extends Abstra
   private String compressCodec;
   private String compressType;
   private boolean multiFileSpray;
+  // Whether the files output by this FileSink can be merged; e.g., files destined for a
+  // bucketed or sorted table/partition cannot be merged.
+  private boolean canBeMerged;
   private int     totalFiles;
   private ArrayList<ExprNodeDesc> partitionCols;
   private int     numFiles;
@@ -68,14 +71,15 @@ public class FileSinkDesc extends Abstra
 
   public FileSinkDesc(final String dirName, final TableDesc tableInfo,
       final boolean compressed, final int destTableId, final boolean multiFileSpray,
-      final int numFiles, final int totalFiles, final ArrayList<ExprNodeDesc> partitionCols,
-      final DynamicPartitionCtx dpCtx) {
+      final boolean canBeMerged, final int numFiles, final int totalFiles,
+      final ArrayList<ExprNodeDesc> partitionCols, final DynamicPartitionCtx dpCtx) {
 
     this.dirName = dirName;
     this.tableInfo = tableInfo;
     this.compressed = compressed;
     this.destTableId = destTableId;
     this.multiFileSpray = multiFileSpray;
+    this.canBeMerged = canBeMerged;
     this.numFiles = numFiles;
     this.totalFiles = totalFiles;
     this.partitionCols = partitionCols;
@@ -90,6 +94,7 @@ public class FileSinkDesc extends Abstra
     this.compressed = compressed;
     destTableId = 0;
     this.multiFileSpray = false;
+    this.canBeMerged = false;
     this.numFiles = 1;
     this.totalFiles = 1;
     this.partitionCols = null;
@@ -98,7 +103,7 @@ public class FileSinkDesc extends Abstra
   @Override
   public Object clone() throws CloneNotSupportedException {
     FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed,
-        destTableId, multiFileSpray, numFiles, totalFiles,
+        destTableId, multiFileSpray, canBeMerged, numFiles, totalFiles,
         partitionCols, dpCtx);
     ret.setCompressCodec(compressCodec);
     ret.setCompressType(compressType);
@@ -184,6 +189,14 @@ public class FileSinkDesc extends Abstra
     this.multiFileSpray = multiFileSpray;
   }
 
+  public boolean canBeMerged() {
+    return canBeMerged;
+  }
+
+  public void setCanBeMerged(boolean canBeMerged) {
+    this.canBeMerged = canBeMerged;
+  }
+
   /**
    * @return the totalFiles
    */

Added: hive/trunk/ql/src/test/queries/clientpositive/bucket5.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/bucket5.q?rev=1423777&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/bucket5.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/bucket5.q Wed Dec 19 07:58:40 2012
@@ -0,0 +1,33 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+set hive.merge.mapfiles = true;
+set hive.merge.mapredfiles = true;
+set mapred.reduce.tasks = 2;
+
+-- Tests that when a multi-insert writes to both a bucketed table and an unbucketed table,
+-- the output of the bucketed table is not merged while the output of the unbucketed table is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+CREATE TABLE unbucketed_table(key INT, value STRING);
+
+EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key;
+
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key;
+
+DESC FORMATTED bucketed_table;
+
+SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10;
+SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10;
+
+-- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table;
+
+-- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table;

Added: hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out?rev=1423777&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out Wed Dec 19 07:58:40 2012
@@ -0,0 +1,584 @@
+PREHOOK: query: -- Tests that when a multi-insert writes to both a bucketed table and an unbucketed table,
+-- the output of the bucketed table is not merged while the output of the unbucketed table is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Tests that when a multi-insert writes to both a bucketed table and an unbucketed table,
+-- the output of the bucketed table is not merged while the output of the unbucketed table is
+
+CREATE TABLE bucketed_table(key INT, value STRING) CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@bucketed_table
+PREHOOK: query: CREATE TABLE unbucketed_table(key INT, value STRING)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE unbucketed_table(key INT, value STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@unbucketed_table
+PREHOOK: query: EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME bucketed_table))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME unbucketed_table))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_CLUSTERBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-2 is a root stage
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+  Stage-4 depends on stages: Stage-2
+  Stage-10 depends on stages: Stage-4 , consists of Stage-7, Stage-6, Stage-8
+  Stage-7
+  Stage-1 depends on stages: Stage-7, Stage-6, Stage-9
+  Stage-5 depends on stages: Stage-1
+  Stage-6
+  Stage-8
+  Stage-9 depends on stages: Stage-8
+
+STAGE PLANS:
+  Stage: Stage-2
+    Map Reduce
+      Alias -> Map Operator Tree:
+        src 
+          TableScan
+            alias: src
+            GatherStats: false
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              outputColumnNames: _col0, _col1
+              Reduce Output Operator
+                key expressions:
+                      expr: UDFToInteger(_col0)
+                      type: int
+                sort order: +
+                Map-reduce partition columns:
+                      expr: UDFToInteger(_col0)
+                      type: int
+                tag: -1
+                value expressions:
+                      expr: _col0
+                      type: string
+                      expr: _col1
+                      type: string
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+                    expr: value
+                    type: string
+              outputColumnNames: _col0, _col1
+              File Output Operator
+                compressed: false
+                GlobalTableId: 0
+#### A masked pattern was here ####
+                NumFilesPerFileSink: 1
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    properties:
+                      columns _col0,_col1
+                      columns.types string,string
+                      escape.delim \
+                TotalFiles: 1
+                GatherStats: false
+                MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: src
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types string:string
+#### A masked pattern was here ####
+              name default.src
+              numFiles 1
+              numPartitions 0
+              numRows 0
+              rawDataSize 0
+              serialization.ddl struct src { string key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              totalSize 5812
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types string:string
+#### A masked pattern was here ####
+                name default.src
+                numFiles 1
+                numPartitions 0
+                numRows 0
+                rawDataSize 0
+                serialization.ddl struct src { string key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                totalSize 5812
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.src
+            name: default.src
+      Reduce Operator Tree:
+        Extract
+          Select Operator
+            expressions:
+                  expr: UDFToInteger(_col0)
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 1
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    SORTBUCKETCOLSPREFIX TRUE
+                    bucket_count 2
+                    bucket_field_name key
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.bucketed_table
+                    serialization.ddl struct bucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.bucketed_table
+              TotalFiles: 1
+              GatherStats: true
+              MultiFileSpray: false
+      Truncated Path -> Alias:
+        /src [src]
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                SORTBUCKETCOLSPREFIX TRUE
+                bucket_count 2
+                bucket_field_name key
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.bucketed_table
+                serialization.ddl struct bucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.bucketed_table
+#### A masked pattern was here ####
+
+  Stage: Stage-3
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+  Stage: Stage-4
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+            Reduce Output Operator
+              key expressions:
+                    expr: _col0
+                    type: string
+              sort order: +
+              Map-reduce partition columns:
+                    expr: _col0
+                    type: string
+              tag: -1
+              value expressions:
+                    expr: _col0
+                    type: string
+                    expr: _col1
+                    type: string
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -mr-10004
+            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+            properties:
+              columns _col0,_col1
+              columns.types string,string
+              escape.delim \
+          
+              input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+              properties:
+                columns _col0,_col1
+                columns.types string,string
+                escape.delim \
+      Reduce Operator Tree:
+        Extract
+          Select Operator
+            expressions:
+                  expr: UDFToInteger(_col0)
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 2
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+#### A masked pattern was here ####
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.unbucketed_table
+                    serialization.ddl struct unbucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.unbucketed_table
+              TotalFiles: 1
+              GatherStats: true
+              MultiFileSpray: false
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-10
+    Conditional Operator
+
+  Stage: Stage-7
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-1
+    Move Operator
+      tables:
+          replace: true
+#### A masked pattern was here ####
+          table:
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+#### A masked pattern was here ####
+
+  Stage: Stage-5
+    Stats-Aggr Operator
+#### A masked pattern was here ####
+
+  Stage: Stage-6
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.unbucketed_table
+                    serialization.ddl struct unbucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.unbucketed_table
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -ext-10005
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types int:string
+#### A masked pattern was here ####
+              name default.unbucketed_table
+              serialization.ddl struct unbucketed_table { i32 key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+            name: default.unbucketed_table
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-8
+    Map Reduce
+      Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+#### A masked pattern was here ####
+              NumFilesPerFileSink: 1
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    columns key,value
+                    columns.types int:string
+#### A masked pattern was here ####
+                    name default.unbucketed_table
+                    serialization.ddl struct unbucketed_table { i32 key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  name: default.unbucketed_table
+              TotalFiles: 1
+              GatherStats: false
+              MultiFileSpray: false
+      Needs Tagging: false
+      Path -> Alias:
+#### A masked pattern was here ####
+      Path -> Partition:
+#### A masked pattern was here ####
+          Partition
+            base file name: -ext-10005
+            input format: org.apache.hadoop.mapred.TextInputFormat
+            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+            properties:
+              bucket_count -1
+              columns key,value
+              columns.types int:string
+#### A masked pattern was here ####
+              name default.unbucketed_table
+              serialization.ddl struct unbucketed_table { i32 key, string value}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          
+              input format: org.apache.hadoop.mapred.TextInputFormat
+              output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+              properties:
+                bucket_count -1
+                columns key,value
+                columns.types int:string
+#### A masked pattern was here ####
+                name default.unbucketed_table
+                serialization.ddl struct unbucketed_table { i32 key, string value}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+              name: default.unbucketed_table
+            name: default.unbucketed_table
+      Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-9
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+
+PREHOOK: query: FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@bucketed_table
+PREHOOK: Output: default@unbucketed_table
+POSTHOOK: query: FROM src
+INSERT OVERWRITE TABLE bucketed_table SELECT key, value
+INSERT OVERWRITE TABLE unbucketed_table SELECT key, value cluster by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@bucketed_table
+POSTHOOK: Output: default@unbucketed_table
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: DESC FORMATTED bucketed_table
+PREHOOK: type: DESCTABLE
+POSTHOOK: query: DESC FORMATTED bucketed_table
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+# col_name            	data_type           	comment             
+	 	 
+key                 	int                 	None                
+value               	string              	None                
+	 	 
+# Detailed Table Information	 	 
+Database:           	default             	 
+#### A masked pattern was here ####
+Protect Mode:       	None                	 
+Retention:          	0                   	 
+#### A masked pattern was here ####
+Table Type:         	MANAGED_TABLE       	 
+Table Parameters:	 	 
+	SORTBUCKETCOLSPREFIX	TRUE                
+	numFiles            	2                   
+	numPartitions       	0                   
+	numRows             	0                   
+	rawDataSize         	0                   
+	totalSize           	5812                
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	2                   	 
+Bucket Columns:     	[key]               	 
+Sort Columns:       	[Order(col:key, order:1)]	 
+Storage Desc Params:	 	 
+	serialization.format	1                   
+PREHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 1 OUT OF 2) s LIMIT 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0	val_0
+0	val_0
+0	val_0
+2	val_2
+4	val_4
+8	val_8
+10	val_10
+12	val_12
+12	val_12
+18	val_18
+PREHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM bucketed_table TABLESAMPLE (BUCKET 2 OUT OF 2) s LIMIT 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+5	val_5
+5	val_5
+5	val_5
+9	val_9
+11	val_11
+15	val_15
+15	val_15
+17	val_17
+19	val_19
+27	val_27
+PREHOOK: query: -- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: -- Should be 2 (not merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM bucketed_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@bucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+2
+PREHOOK: query: -- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@unbucketed_table
+#### A masked pattern was here ####
+POSTHOOK: query: -- Should be 1 (merged)
+SELECT COUNT(DISTINCT INPUT__FILE__NAME) FROM unbucketed_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@unbucketed_table
+#### A masked pattern was here ####
+POSTHOOK: Lineage: bucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: bucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: unbucketed_table.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1