Posted to commits@hive.apache.org by na...@apache.org on 2011/04/30 00:51:47 UTC

svn commit: r1097980 [1/11] - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/ conf/ ql/src/java/org/apache/hadoop/hive/ql/io/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org/apache/hadoop/hive/ql/parse/ ql/src/java/org/apa...

Author: namit
Date: Fri Apr 29 22:51:44 2011
New Revision: 1097980

URL: http://svn.apache.org/viewvc?rev=1097980&view=rev
Log:
HIVE-2121 Input Sampling By Splits
(Siying Dong via namit)


Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java
    hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q
    hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q
    hive/trunk/ql/src/test/queries/clientpositive/split_sample.q
    hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out
    hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out
    hive/trunk/ql/src/test/results/clientpositive/split_sample.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
    hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out
    hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out
    hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out
    hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out
    hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin1.q.out
    hive/trunk/ql/src/test/results/clientpositive/disable_merge_for_bucketing.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample1.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample10.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample2.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample3.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample4.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample5.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample6.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample7.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample8.q.out
    hive/trunk/ql/src/test/results/clientpositive/sample9.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample1.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample2.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample3.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample4.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample5.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample6.q.out
    hive/trunk/ql/src/test/results/compiler/parse/sample7.q.out
    hive/trunk/ql/src/test/results/compiler/plan/case_sensitivity.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/cast1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby4.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby5.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/groupby6.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input2.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input20.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input3.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input4.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input5.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input6.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input7.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input8.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input9.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input_part1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input_testsequencefile.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input_testxpath.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/input_testxpath2.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join2.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join3.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join4.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join5.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join6.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join7.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/join8.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample2.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample3.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample4.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample5.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample6.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/sample7.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/subq.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/udf1.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/udf4.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/udf6.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/udf_case.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/udf_when.q.xml
    hive/trunk/ql/src/test/results/compiler/plan/union.q.xml
    hive/trunk/shims/src/0.20/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java
    hive/trunk/shims/src/0.20S/java/org/apache/hadoop/hive/shims/Hadoop20SShims.java
    hive/trunk/shims/src/common/java/org/apache/hadoop/hive/shims/HadoopShims.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri Apr 29 22:51:44 2011
@@ -303,6 +303,10 @@ public class HiveConf extends Configurat
 
     //small table file size
     HIVESMALLTABLESFILESIZE("hive.smalltable.filesize",25000000L), //25M
+
+    // random number for split sampling
+    HIVESAMPLERANDOMNUM("hive.sample.seednumber", 0),
+
     // test mode in hive mode
     HIVETESTMODE("hive.test.mode", false),
     HIVETESTMODEPREFIX("hive.test.mode.prefix", "test_"),

Modified: hive/trunk/conf/hive-default.xml
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml (original)
+++ hive/trunk/conf/hive-default.xml Fri Apr 29 22:51:44 2011
@@ -1067,4 +1067,11 @@
     rebuild work. This is very helpful for tables with thousands of partitions.</description>
 </property>
 
+<property>
+  <name>hive.sample.seednumber</name>
+  <value>0</value>
+  <description>A number used in percentage sampling. By changing this number, users can change
+   which subset of the data is sampled.</description>
+</property>
+
 </configuration>
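
To make the description above concrete, here is a minimal sketch (illustrative only, not part of this patch) of how the seed value picks the starting split for a sampled alias; the split count and seed values shown are assumptions.

// Illustrative sketch only: the seed rotates the starting position among an alias's
// splits, so different seed values sample different subsets of the input.
public class SeedNumberSketch {
  public static void main(String[] args) {
    int numSplits = 10;            // assumed number of splits for the sampled alias
    int[] seeds = {0, 3, 4, 5};    // assumed values of hive.sample.seednumber
    for (int seed : seeds) {
      int startIndex = seed % numSplits; // the same arithmetic used by sampleSplits()
      System.out.println("seed " + seed + " -> sampling starts at split " + startIndex);
    }
  }
}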

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java Fri Apr 29 22:51:44 2011
@@ -24,10 +24,10 @@ import java.io.File;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
-import java.util.HashMap;
 import java.util.Map;
 import java.util.Queue;
 import java.util.Set;
@@ -38,12 +38,14 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.parse.SplitSample;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
 import org.apache.hadoop.hive.shims.HadoopShims.InputSplitShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
@@ -53,7 +55,6 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.hive.ql.exec.Operator;
 
 /**
  * CombineHiveInputFormat is a parameterized InputFormat which looks at the path
@@ -211,13 +212,18 @@ public class CombineHiveInputFormat<K ex
 
       out.writeUTF(inputFormatClassName);
     }
+
+    @Override
+    public void shrinkSplit(long length) {
+      inputSplitShim.shrinkSplit(length);
+    }
   }
 
   // Splits are not shared across different partitions with different input formats.
   // For example, 2 partitions (1 sequencefile and 1 rcfile) will have 2 different splits
   private static class CombinePathInputFormat {
-    private List<Operator<? extends Serializable>> opList;
-    private String inputFormatClassName;
+    private final List<Operator<? extends Serializable>> opList;
+    private final String inputFormatClassName;
 
     public CombinePathInputFormat(List<Operator<? extends Serializable>> opList,
                                   String inputFormatClassName) {
@@ -225,6 +231,7 @@ public class CombineHiveInputFormat<K ex
       this.inputFormatClassName = inputFormatClassName;
     }
 
+    @Override
     public boolean equals(Object o) {
       if (o instanceof CombinePathInputFormat) {
         CombinePathInputFormat mObj = (CombinePathInputFormat)o;
@@ -248,7 +255,6 @@ public class CombineHiveInputFormat<K ex
    */
   @Override
   public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
-
     init(job);
     Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
     Map<String, Operator<? extends Serializable>> aliasToWork =
@@ -344,7 +350,7 @@ public class CombineHiveInputFormat<K ex
       boolean done = false;
 
       if (!mrwork.isMapperCannotSpanPartns()) {
-        opList = HiveFileFormatUtils.doGetAliasesFromPath(
+        opList = HiveFileFormatUtils.doGetWorksFromPath(
                    pathToAliases, aliasToWork, filterPath);
         f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
       }
@@ -375,6 +381,11 @@ public class CombineHiveInputFormat<K ex
     }
 
     InputSplitShim[] iss = combine.getSplits(job, 1);
+
+    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
+      iss = sampleSplits(iss);
+    }
+
     for (InputSplitShim is : iss) {
       CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);
       result.add(csplit);
@@ -385,6 +396,88 @@ public class CombineHiveInputFormat<K ex
   }
 
   /**
+   * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
+   *
+   * First, splits are grouped by the alias they serve. If a split serves more than one
+   * alias, or serves an alias that is not sampled, it is added to the returned list directly.
+   * This yields a list of exclusive splits for every alias to be sampled.
+   * For each such alias, we start at position seedNumber % totalNumber and keep adding
+   * splits until the accumulated size reaches the requested percentage.
+   * @param splits
+   * @return the sampled splits
+   */
+  private InputSplitShim[] sampleSplits(InputSplitShim[] splits) {
+    HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
+    List<InputSplitShim> retLists = new ArrayList<InputSplitShim>();
+    Map<String, ArrayList<InputSplitShim>> aliasToSplitList = new HashMap<String, ArrayList<InputSplitShim>>();
+    Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
+
+    // Populate list of exclusive splits for every sampled alias
+    //
+    for (InputSplitShim split : splits) {
+      String alias = null;
+      for (Path path : split.getPaths()) {
+        List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
+            pathToAliases, path);
+        // a path disqualifies its split from being sampled if:
+        // 1. it serves more than one alias
+        // 2. the alias it serves is not sampled
+        // 3. it serves a different alias than another path in the same split
+        if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) ||
+            (alias != null && l.get(0) != alias)) {
+          alias = null;
+          break;
+        }
+        alias = l.get(0);
+      }
+
+      if (alias != null) {
+        // split exclusively serves alias, which needs to be sampled
+        // add it to the split list of the alias.
+        if (!aliasToSplitList.containsKey(alias)) {
+          aliasToSplitList.put(alias, new ArrayList<InputSplitShim>());
+        }
+        aliasToSplitList.get(alias).add(split);
+      } else {
+        // The split doesn't exclusively serve one alias
+        retLists.add(split);
+      }
+    }
+
+    // for every sampled alias, figure out which splits to sample and add
+    // them to the return list
+    //
+    for (Map.Entry<String, ArrayList<InputSplitShim>> entry: aliasToSplitList.entrySet()) {
+      ArrayList<InputSplitShim> splitList = entry.getValue();
+      long totalSize = 0;
+      for (InputSplitShim split : splitList) {
+        totalSize += split.getLength();
+      }
+
+      long targetSize = (long) (totalSize * nameToSamples.get(entry.getKey()).getPercent() / 100D);
+      int startIndex = nameToSamples.get(entry.getKey()).getSeedNum() % splitList.size();
+      int size = 0;
+      for (int i = 0; i < splitList.size(); i++) {
+        InputSplitShim split = splitList.get((startIndex + i) % splitList.size());
+        retLists.add(split);
+        long splitLength = split.getLength();
+        if (size + splitLength >= targetSize) {
+          LOG.info("Sampled alias " + entry.getKey() + " using " + (i + 1) + " splits");
+          if (size + splitLength > targetSize) {
+            split.shrinkSplit(targetSize - size);
+          }
+          break;
+        }
+        size += splitLength;
+      }
+
+    }
+
+    InputSplitShim[] retArray = new InputSplitShim[retLists.size()];
+    return retLists.toArray(retArray);
+  }
+
+  /**
    * Create a generic Hive RecordReader than can iterate over all chunks in a
    * CombinedFileSplit.
    */
@@ -417,7 +510,7 @@ public class CombineHiveInputFormat<K ex
   }
 
   static class CombineFilter implements PathFilter {
-    private List<String> pStrings = new ArrayList<String>();
+    private final List<String> pStrings = new ArrayList<String>();
 
     // store a path prefix in this TestFilter
     // PRECONDITION: p should always be a directory
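
For reference, the following is a minimal standalone sketch (not part of this patch) of the per-alias selection arithmetic that sampleSplits() above applies: accumulate splits starting at seedNumber % n until the requested percentage of the total bytes is covered, shrinking the last pick if it overshoots. The 100-byte split sizes, the 25 percent figure, and the seed of 3 are illustrative assumptions only.

// Illustrative-only sketch of the per-alias selection done in sampleSplits():
// start at (seed % n), accumulate splits until the requested percentage of the
// total bytes is covered, and shrink the last pick if it overshoots.
public class SplitSampleArithmeticSketch {
  public static void main(String[] args) {
    long[] splitLengths = {100, 100, 100, 100, 100, 100, 100, 100, 100, 100}; // assumed split sizes
    double percent = 25.0; // assumed TABLESAMPLE(25 PERCENT)
    int seed = 3;          // assumed hive.sample.seednumber

    long totalSize = 0;
    for (long len : splitLengths) {
      totalSize += len;
    }
    long targetSize = (long) (totalSize * percent / 100D);
    int startIndex = seed % splitLengths.length;

    long size = 0;
    for (int i = 0; i < splitLengths.length; i++) {
      int pos = (startIndex + i) % splitLengths.length;
      long len = splitLengths[pos];
      if (size + len >= targetSize) {
        if (size + len > targetSize) {
          // the last pick is only read up to (targetSize - size) bytes, cf. shrinkSplit()
          System.out.println("take split " + pos + " shrunk to " + (targetSize - size) + " bytes");
        } else {
          System.out.println("take split " + pos + " (" + len + " bytes)");
        }
        break;
      }
      System.out.println("take split " + pos + " (" + len + " bytes)");
      size += len;
    }
    // With these assumptions: splits 3 and 4 are taken whole and split 5 is shrunk to 50 bytes.
  }
}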

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java Fri Apr 29 22:51:44 2011
@@ -22,8 +22,8 @@ import java.io.File;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.List;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
@@ -32,15 +32,15 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
-import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.InputFormat;
@@ -384,22 +384,34 @@ public final class HiveFileFormatUtils {
    * @param aliasToWork    The operator tree to be invoked for a given alias
    * @param dir            The path to look for
    **/
-  public static List<Operator<? extends Serializable>> doGetAliasesFromPath(
+  public static List<Operator<? extends Serializable>> doGetWorksFromPath(
     Map<String, ArrayList<String>> pathToAliases,
     Map<String, Operator<? extends Serializable>> aliasToWork, Path dir) {
     List<Operator<? extends Serializable>> opList =
       new ArrayList<Operator<? extends Serializable>>();
-    if (pathToAliases == null) {
-      return opList;
-    }
-    String path = getMatchingPath(pathToAliases, dir);
-    List<String> aliases = pathToAliases.get(path);
+
+    List<String> aliases = doGetAliasesFromPath(pathToAliases, dir);
     for (String alias : aliases) {
       opList.add(aliasToWork.get(alias));
     }
     return opList;
   }
 
+  /**
+   * Get the list of aliases that need to be read for the given path
+   * @param pathToAliases  mapping from path to aliases
+   * @param dir            The path to look for
+   **/
+  public static List<String> doGetAliasesFromPath(
+    Map<String, ArrayList<String>> pathToAliases,
+    Path dir) {
+    if (pathToAliases == null) {
+      return new ArrayList<String>();
+    }
+    String path = getMatchingPath(pathToAliases, dir);
+    return pathToAliases.get(path);
+  }
+
   private HiveFileFormatUtils() {
     // prevent instantiation
   }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java Fri Apr 29 22:51:44 2011
@@ -55,7 +55,6 @@ import org.apache.hadoop.hive.ql.parse.S
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
 import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles;
-import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
 import org.apache.hadoop.hive.ql.plan.ConditionalWork;
 import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
@@ -72,6 +71,7 @@ import org.apache.hadoop.hive.ql.plan.Re
 import org.apache.hadoop.hive.ql.plan.StatsWork;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 
 /**
@@ -425,7 +425,7 @@ public class GenMRFileSink1 implements N
     aliases.add(inputDir); // dummy alias: just use the input path
 
     // constructing the default MapredWork
-    MapredWork cplan = GenMapRedUtils.getMapRedWork(conf);
+    MapredWork cplan = GenMapRedUtils.getMapRedWorkFromConf(conf);
     cplan.getPathToAliases().put(inputDir, aliases);
     cplan.getPathToPartitionInfo().put(inputDir, new PartitionDesc(tblDesc, null));
     cplan.setNumReduceTasks(0);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java Fri Apr 29 22:51:44 2011
@@ -33,11 +33,11 @@ import org.apache.hadoop.hive.ql.lib.Nod
 import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
-import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
 import org.apache.hadoop.hive.ql.parse.ParseContext;
 import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.parse.QBParseInfo;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
 import org.apache.hadoop.hive.ql.plan.MapredWork;
 import org.apache.hadoop.hive.ql.plan.StatsWork;
 /**
@@ -63,7 +63,7 @@ public class GenMRTableScan1 implements 
     Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
 
     // create a dummy MapReduce task
-    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
     Task<? extends Serializable> currTask = TaskFactory.get(currWork, parseCtx.getConf());
     Operator<? extends Serializable> currTopOp = op;
     ctx.setCurrTask(currTask);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java Fri Apr 29 22:51:44 2011
@@ -28,7 +28,6 @@ import java.util.Stack;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.exec.OperatorFactory;
 import org.apache.hadoop.hive.ql.exec.Task;
@@ -132,7 +131,7 @@ public class GenMRUnion1 implements Node
     // union is encountered for the first time
     if (uCtxTask == null) {
       uCtxTask = new GenMRUnionCtx();
-      uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+      uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
       uTask = TaskFactory.get(uPlan, parseCtx.getConf());
       uCtxTask.setUTask(uTask);
       ctx.setUnionTask(union, uCtxTask);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java Fri Apr 29 22:51:44 2011
@@ -60,16 +60,16 @@ import org.apache.hadoop.hive.ql.parse.R
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.FetchWork;
 import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
-import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
 import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
 import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
-import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext;
 import org.apache.hadoop.hive.ql.plan.MapredWork;
 import org.apache.hadoop.hive.ql.plan.PartitionDesc;
 import org.apache.hadoop.hive.ql.plan.PlanUtils;
 import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
 import org.apache.hadoop.hive.ql.plan.TableDesc;
 import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext;
 
 /**
  * General utility common functions for the Processor to convert operator into
@@ -474,7 +474,7 @@ public final class GenMapRedUtils {
   throws SemanticException {
     // Generate a new task
     ParseContext parseCtx = opProcCtx.getParseCtx();
-    MapredWork cplan = getMapRedWork(parseCtx.getConf());
+    MapredWork cplan = getMapRedWork(parseCtx);
     Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx
         .getConf());
     Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
@@ -544,6 +544,8 @@ public final class GenMapRedUtils {
 
     PrunedPartitionList partsList = pList;
 
+    plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
+
     if (partsList == null) {
       try {
         partsList = parseCtx.getOpToPartList().get((TableScanOperator)topOp);
@@ -836,9 +838,21 @@ public final class GenMapRedUtils {
    *
    * @return the new plan
    */
-  public static MapredWork getMapRedWork(HiveConf conf) {
+  public static MapredWork getMapRedWork(ParseContext parseCtx) {
+    MapredWork work = getMapRedWorkFromConf(parseCtx.getConf());
+    work.setNameToSplitSample(parseCtx.getNameToSplitSample());
+    return work;
+  }
+
+  /**
+   * Create a new plan and return it. The plan won't contain the name-to-split-sample
+   * information from the parse context.
+   *
+   * @return the new plan
+   */
+  public static MapredWork getMapRedWorkFromConf(HiveConf conf) {
     MapredWork work = new MapredWork();
-    // This code has been only added for testing
+
     boolean mapperCannotSpanPartns =
       conf.getBoolVar(
         HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
@@ -1020,7 +1034,7 @@ public final class GenMapRedUtils {
     // union is encountered for the first time
     if (uCtxTask == null) {
       uCtxTask = new GenMRUnionCtx();
-      uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+      uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
       uTask = TaskFactory.get(uPlan, parseCtx.getConf());
       uCtxTask.setUTask(uTask);
       ctx.setUnionTask(union, uCtxTask);

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java Fri Apr 29 22:51:44 2011
@@ -127,7 +127,7 @@ public final class MapJoinFactory {
       GenMRProcContext opProcCtx = (GenMRProcContext) procCtx;
 
       ParseContext parseCtx = opProcCtx.getParseCtx();
-      MapredWork cplan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+      MapredWork cplan = GenMapRedUtils.getMapRedWork(parseCtx);
       Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx
           .getConf());
       Task<? extends Serializable> currTask = opProcCtx.getCurrTask();
@@ -211,7 +211,7 @@ public final class MapJoinFactory {
         ctx.setMapJoinCtx(mapJoin, mjCtx);
       }
 
-      MapredWork mjPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+      MapredWork mjPlan = GenMapRedUtils.getMapRedWork(parseCtx);
       Task<? extends Serializable> mjTask = TaskFactory.get(mjPlan, parseCtx
           .getConf());
 

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g Fri Apr 29 22:51:44 2011
@@ -173,7 +173,8 @@ TOK_ALTERTABLE_CLUSTER_SORT;
 TOK_TABCOLNAME;
 TOK_TABLELOCATION;
 TOK_PARTITIONLOCATION;
-TOK_TABLESAMPLE;
+TOK_TABLEBUCKETSAMPLE;
+TOK_TABLESPLITSAMPLE;
 TOK_TMP_FILE;
 TOK_TABSORTCOLNAMEASC;
 TOK_TABSORTCOLNAMEDESC;
@@ -1620,12 +1621,27 @@ fromSource
     (tableSource | subQuerySource) (lateralView^)*
     ;
 
+tableBucketSample
+@init { msgs.push("table bucket sample specification"); }
+@after { msgs.pop(); }
+    :
+    KW_TABLESAMPLE LPAREN KW_BUCKET (numerator=Number) KW_OUT KW_OF (denominator=Number) (KW_ON expr+=expression (COMMA expr+=expression)*)? RPAREN -> ^(TOK_TABLEBUCKETSAMPLE $numerator $denominator $expr*)
+    ;
+
+splitSample
+@init { msgs.push("table split sample specification"); }
+@after { msgs.pop(); }
+    :
+    KW_TABLESAMPLE LPAREN  (numerator=Number) KW_PERCENT RPAREN -> ^(TOK_TABLESPLITSAMPLE $numerator)
+    ;
+    
 tableSample
 @init { msgs.push("table sample specification"); }
 @after { msgs.pop(); }
     :
-    KW_TABLESAMPLE LPAREN KW_BUCKET (numerator=Number) KW_OUT KW_OF (denominator=Number) (KW_ON expr+=expression (COMMA expr+=expression)*)? RPAREN -> ^(TOK_TABLESAMPLE $numerator $denominator $expr*)
-    ;
+    tableBucketSample |
+    splitSample
+    ;    
 
 tableSource
 @init { msgs.push("table source"); }
@@ -2172,6 +2188,7 @@ KW_TABLESAMPLE: 'TABLESAMPLE';
 KW_BUCKET: 'BUCKET';
 KW_OUT: 'OUT';
 KW_OF: 'OF';
+KW_PERCENT: 'PERCENT';
 KW_CAST: 'CAST';
 KW_ADD: 'ADD';
 KW_REPLACE: 'REPLACE';

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java Fri Apr 29 22:51:44 2011
@@ -64,6 +64,7 @@ public class ParseContext {
   private Map<JoinOperator, QBJoinTree> joinContext;
   private Map<MapJoinOperator, QBJoinTree> mapJoinContext;
   private HashMap<TableScanOperator, Table> topToTable;
+  private HashMap<String, SplitSample> nameToSplitSample;
   private List<LoadTableDesc> loadTableWork;
   private List<LoadFileDesc> loadFileWork;
   private Context ctx;
@@ -146,7 +147,8 @@ public class ParseContext {
       Map<GroupByOperator, Set<String>> groupOpToInputTables,
       Map<String, PrunedPartitionList> prunedPartitions,
       HashMap<TableScanOperator, sampleDesc> opToSamplePruner,
-      SemanticAnalyzer.GlobalLimitCtx globalLimitCtx) {
+      SemanticAnalyzer.GlobalLimitCtx globalLimitCtx,
+      HashMap<String, SplitSample> nameToSplitSample) {
     this.conf = conf;
     this.qb = qb;
     this.ast = ast;
@@ -169,6 +171,7 @@ public class ParseContext {
     this.groupOpToInputTables = groupOpToInputTables;
     this.prunedPartitions = prunedPartitions;
     this.opToSamplePruner = opToSamplePruner;
+    this.nameToSplitSample = nameToSplitSample;
     this.globalLimitCtx = globalLimitCtx;
   }
 
@@ -318,6 +321,14 @@ public class ParseContext {
     this.opParseCtx = opParseCtx;
   }
 
+  public HashMap<String, SplitSample> getNameToSplitSample() {
+    return nameToSplitSample;
+  }
+
+  public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
+    this.nameToSplitSample = nameToSplitSample;
+  }
+
   /**
    * @return the loadTableWork
    */

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Fri Apr 29 22:51:44 2011
@@ -70,6 +70,7 @@ import org.apache.hadoop.hive.ql.exec.Un
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.hooks.ReadEntity;
 import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
 import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
 import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
 import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
@@ -187,6 +188,11 @@ public class SemanticAnalyzer extends Ba
   private UnionProcContext uCtx;
   List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer;
   private HashMap<TableScanOperator, sampleDesc> opToSamplePruner;
+  /**
+   * A map for split sampling, from alias to an instance of SplitSample
+   * that describes the percentage and the seed number.
+   */
+  private final HashMap<String, SplitSample> nameToSplitSample;
   Map<GroupByOperator, Set<String>> groupOpToInputTables;
   Map<String, PrunedPartitionList> prunedPartitions;
   private List<FieldSchema> resultSchema;
@@ -249,6 +255,7 @@ public class SemanticAnalyzer extends Ba
     opToPartPruner = new HashMap<TableScanOperator, ExprNodeDesc>();
     opToPartList = new HashMap<TableScanOperator, PrunedPartitionList>();
     opToSamplePruner = new HashMap<TableScanOperator, sampleDesc>();
+    nameToSplitSample = new HashMap<String, SplitSample>();
     topOps = new HashMap<String, Operator<? extends Serializable>>();
     topSelOps = new HashMap<String, Operator<? extends Serializable>>();
     loadTableWork = new ArrayList<LoadTableDesc>();
@@ -308,7 +315,7 @@ public class SemanticAnalyzer extends Ba
         topSelOps, opParseCtx, joinContext, topToTable, loadTableWork,
         loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
         listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
-        opToSamplePruner, globalLimitCtx);
+        opToSamplePruner, globalLimitCtx, nameToSplitSample);
   }
 
   @SuppressWarnings("nls")
@@ -428,21 +435,30 @@ public class SemanticAnalyzer extends Ba
     // and the alias (if alias is not present, the table name
     // is used as an alias)
     boolean tableSamplePresent = false;
+    boolean splitSamplePresent = false;
+
     int aliasIndex = 0;
     if (tabref.getChildCount() == 2) {
       // tablename tablesample
       // OR
       // tablename alias
       ASTNode ct = (ASTNode) tabref.getChild(1);
-      if (ct.getToken().getType() == HiveParser.TOK_TABLESAMPLE) {
+      if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
         tableSamplePresent = true;
+      } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+        splitSamplePresent = true;
       } else {
         aliasIndex = 1;
       }
     } else if (tabref.getChildCount() == 3) {
       // table name table sample alias
       aliasIndex = 2;
-      tableSamplePresent = true;
+      ASTNode ct = (ASTNode) tabref.getChild(1);
+      if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
+        tableSamplePresent = true;
+      } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+        splitSamplePresent = true;
+      }
     }
     ASTNode tableTree = (ASTNode) (tabref.getChild(0));
 
@@ -482,6 +498,23 @@ public class SemanticAnalyzer extends Ba
               .getChild(0));
         }
       }
+    } else if (splitSamplePresent) {
+      // only CombineHiveInputFormat supports this optimization
+      String inputFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEINPUTFORMAT);
+      if (!inputFormat.equals(
+        CombineHiveInputFormat.class.getName())) {
+        throw new SemanticException(
+            "Percentage sampling is not supported in " + inputFormat);
+      }
+      ASTNode sampleClause = (ASTNode) tabref.getChild(1);
+      String alias_id = getAliasId(alias, qb);
+      String strPercentage = unescapeIdentifier(sampleClause.getChild(0).getText());
+      Double percent = Double.valueOf(strPercentage).doubleValue();
+      if (percent < 0  || percent > 100) {
+        throw new SemanticException("Sampling percentage should be between 0 and 100.");
+      }
+      nameToSplitSample.put(alias_id, new SplitSample(
+          percent, conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM)));
     }
     // Insert this map into the stats
     qb.setTabAlias(alias, tabIdName);
@@ -5759,10 +5792,14 @@ public class SemanticAnalyzer extends Ba
     return equalsExpr;
   }
 
+  private String getAliasId(String alias, QB qb) {
+    return (qb.getId() == null ? alias : qb.getId() + ":" + alias);
+  }
+
   @SuppressWarnings("nls")
   private Operator genTablePlan(String alias, QB qb) throws SemanticException {
 
-    String alias_id = (qb.getId() == null ? alias : qb.getId() + ":" + alias);
+    String alias_id = getAliasId(alias, qb);
     Table tab = qb.getMetaData().getSrcForAlias(alias);
     RowResolver rwsch;
 
@@ -6356,17 +6393,19 @@ public class SemanticAnalyzer extends Ba
 
     // determine the query qualifies reduce input size for LIMIT
     // The query only qualifies when there are only one top operator
-    // and there is no transformer or UDTF
+    // and there is no transformer or UDTF and no block sampling
+    // is used.
     if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVELIMITOPTENABLE)
         && ctx.getTryCount() == 0 && topOps.size() == 1
-        && !globalLimitCtx.ifHasTransformOrUDTF()) {
+        && !globalLimitCtx.ifHasTransformOrUDTF() &&
+        nameToSplitSample.isEmpty()) {
 
       // Here we recursively check:
       // 1. whether there are exact one LIMIT in the query
       // 2. whether there is no aggregation, group-by, distinct, sort by,
       //    distributed by, or table sampling in any of the sub-query.
       // The query only qualifies if both conditions are satisfied.
-      // 
+      //
       // Example qualified queries:
       //    CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
       //    INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
@@ -6817,7 +6856,7 @@ public class SemanticAnalyzer extends Ba
         opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable,
         loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
         listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
-        opToSamplePruner, globalLimitCtx);
+        opToSamplePruner, globalLimitCtx, nameToSplitSample);
 
     Optimizer optm = new Optimizer();
     optm.setPctx(pCtx);

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java Fri Apr 29 22:51:44 2011
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.hive.ql.plan.Explain;
+
+
+
+/**
+ *
+ * This class stores the information specified in a TABLESAMPLE(n PERCENT) clause,
+ * e.g. for the clause "FROM t TABLESAMPLE(1 PERCENT)" it stores the percentage 1,
+ * and the seed number determines which 1% is chosen. Currently the seed comes from
+ * the configuration property hive.sample.seednumber.
+ *
+ */
+public class SplitSample implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * The percentage of the TABLESAMPLE clause.
+   */
+  private double percent;
+
+  /**
+   * The number used to determine which part of the input to sample
+   */
+  private int seedNum = 0;
+
+  public SplitSample() {
+  }
+
+
+  public SplitSample(double percent, int seedNum) {
+    this.percent = percent;
+    this.seedNum = seedNum;
+  }
+
+  @Explain(displayName = "percentage")
+  public double getPercent() {
+    return percent;
+  }
+
+  public void setPercent(double percent) {
+    this.percent = percent;
+  }
+
+  @Explain(displayName = "seed number")
+  public int getSeedNum() {
+    return seedNum;
+  }
+
+  public void setSeedNum(int seedNum) {
+    this.seedNum = seedNum;
+  }
+
+}

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java Fri Apr 29 22:51:44 2011
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.ql.plan;
 import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -29,6 +30,7 @@ import org.apache.hadoop.hive.ql.exec.Op
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.parse.OpParseContext;
 import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SplitSample;
 
 /**
  * MapredWork.
@@ -49,6 +51,8 @@ public class MapredWork implements Seria
 
   private LinkedHashMap<String, PartitionDesc> aliasToPartnInfo;
 
+  private HashMap<String, SplitSample> nameToSplitSample;
+
   // map<->reduce interface
   // schema of the map-reduce 'key' object - this is homogeneous
   private TableDesc keyDesc;
@@ -201,6 +205,15 @@ public class MapredWork implements Seria
     return reducer;
   }
 
+  @Explain(displayName = "Percentage Sample")
+  public HashMap<String, SplitSample> getNameToSplitSample() {
+    return nameToSplitSample;
+  }
+
+  public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
+    this.nameToSplitSample = nameToSplitSample;
+  }
+
   public void setReducer(final Operator<?> reducer) {
     this.reducer = reducer;
   }

Added: hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,3 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+select key from src tablesample(105 percent);

Added: hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,3 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+select key from src tablesample(1 percent);

Added: hive/trunk/ql/src/test/queries/clientpositive/split_sample.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/split_sample.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/split_sample.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/split_sample.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,86 @@
+drop table ss_src1;
+drop table ss_src2;
+drop table ss_src3;
+drop table ss_i_part;
+drop table ss_t3;
+drop table ss_t4;
+drop table ss_t5;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set mapred.max.split.size=300;
+set mapred.min.split.size=300;
+set mapred.min.split.size.per.node=300;
+set mapred.min.split.size.per.rack=300;
+set hive.merge.smallfiles.avgsize=1;
+
+-- create multiple file inputs (to enable multiple splits)
+create table ss_i_part (key int, value string) partitioned by (p string);
+insert overwrite table ss_i_part partition (p='1') select key, value from src;
+insert overwrite table ss_i_part partition (p='2') select key, value from src;
+insert overwrite table ss_i_part partition (p='3') select key, value from src;
+create table ss_src2 as select key, value from ss_i_part;
+select count(1) from ss_src2 tablesample(1 percent);
+
+-- sample first split
+desc ss_src2;
+set hive.sample.seednumber=0;
+explain select key, value from ss_src2 tablesample(1 percent) limit 10;
+select key, value from ss_src2 tablesample(1 percent) limit 10;
+
+-- verify seed number of sampling
+insert overwrite table ss_i_part partition (p='1') select key+10000, value from src;
+insert overwrite table ss_i_part partition (p='2') select key+20000, value from src;
+insert overwrite table ss_i_part partition (p='3') select key+30000, value from src;
+create table ss_src3 as select key, value from ss_i_part;
+set hive.sample.seednumber=3;
+create table ss_t3 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+set hive.sample.seednumber=4;
+create table ss_t4 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+set hive.sample.seednumber=5;
+create table ss_t5 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+select sum(s) from (select s from ss_t3 union all select s from ss_t4 union all select s from ss_t5) t;
+
+-- sample more than one split
+explain select count(distinct key) from ss_src2 tablesample(70 percent) limit 10;
+select count(distinct key) from ss_src2 tablesample(70 percent) limit 10;
+
+-- sample all splits
+select count(1) from ss_src2 tablesample(100 percent);
+
+-- subquery
+explain select key from (select key from ss_src2 tablesample(1 percent) limit 10) subq;
+select key from (select key from ss_src2 tablesample(1 percent) limit 10) subq;
+
+-- groupby
+select key, count(1) from ss_src2 tablesample(1 percent) group by key order by key;
+
+-- sample one of two tables:
+create table ss_src1 as select * from ss_src2;
+select t2.key as k from ss_src1 join ss_src2 tablesample(1 percent) t2 on ss_src1.key=t2.key order by k;
+
+-- sample two tables
+explain select * from (
+select t1.key as k1, t2.key as k from ss_src1 tablesample(80 percent) t1 full outer join ss_src2 tablesample(2 percent) t2 on t1.key=t2.key
+) subq where k in (199, 10199, 20199) or k1 in (199, 10199, 20199);
+
+select * from (
+select t1.key as k1, t2.key as k from ss_src1 tablesample(80 percent) t1 full outer join ss_src2 tablesample(2 percent) t2 on t1.key=t2.key
+) subq where k in (199, 10199, 20199) or k1 in (199, 10199, 20199);
+
+-- shrink last split
+explain select count(1) from ss_src2 tablesample(1 percent);
+set mapred.max.split.size=300000;
+set mapred.min.split.size=300000;
+set mapred.min.split.size.per.node=300000;
+set mapred.min.split.size.per.rack=300000;
+select count(1) from ss_src2 tablesample(1 percent);
+select count(1) from ss_src2 tablesample(50 percent);
+
+
+drop table ss_src1;
+drop table ss_src2;
+drop table ss_src3;
+drop table ss_i_part;
+drop table ss_t3;
+drop table ss_t4;
+drop table ss_t5;

Added: hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out Fri Apr 29 22:51:44 2011
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: Sampling percentage should be between 0 and 100.

Added: hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out Fri Apr 29 22:51:44 2011
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: Percentage sampling is not supported in org.apache.hadoop.hive.ql.io.HiveInputFormat

Modified: hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
               serialization.ddl struct src { string key, string value}
               serialization.format 1
               serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-              transient_lastDdlTime 1297328964
+              transient_lastDdlTime 1303258260
             serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
           
               input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
                 serialization.ddl struct src { string key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297328964
+                transient_lastDdlTime 1303258260
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.src
             name: default.src
@@ -97,9 +97,9 @@ STAGE PLANS:
             File Output Operator
               compressed: false
               GlobalTableId: 1
-              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000
+              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000
               NumFilesPerFileSink: 1
-              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000/
+              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000/
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -115,7 +115,7 @@ STAGE PLANS:
                     serialization.ddl struct bucket1_1 { i32 key, string value}
                     serialization.format 1
                     serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                    transient_lastDdlTime 1297330232
+                    transient_lastDdlTime 1303259753
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                   name: default.bucket1_1
               TotalFiles: 1
@@ -126,7 +126,7 @@ STAGE PLANS:
     Move Operator
       tables:
           replace: true
-          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000
+          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000
           table:
               input format: org.apache.hadoop.mapred.TextInputFormat
               output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,14 +142,14 @@ STAGE PLANS:
                 serialization.ddl struct bucket1_1 { i32 key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297330232
+                transient_lastDdlTime 1303259753
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.bucket1_1
-          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10001
+          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10001
 
   Stage: Stage-2
     Stats-Aggr Operator
-      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000/
+      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000/
 
 
 PREHOOK: query: insert overwrite table bucket1_1
@@ -167,11 +167,11 @@ POSTHOOK: Lineage: bucket1_1.value SIMPL
 PREHOOK: query: select * from bucket1_1 order by key
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucket1_1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-37_416_6808991169590499902/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-35-58_044_385144993656634200/-mr-10000
 POSTHOOK: query: select * from bucket1_1 order by key
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket1_1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-37_416_6808991169590499902/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-35-58_044_385144993656634200/-mr-10000
 POSTHOOK: Lineage: bucket1_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket1_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 0	val_0

Modified: hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
               serialization.ddl struct src { string key, string value}
               serialization.format 1
               serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-              transient_lastDdlTime 1297328964
+              transient_lastDdlTime 1303258260
             serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
           
               input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
                 serialization.ddl struct src { string key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297328964
+                transient_lastDdlTime 1303258260
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.src
             name: default.src
@@ -97,9 +97,9 @@ STAGE PLANS:
             File Output Operator
               compressed: false
               GlobalTableId: 1
-              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000
+              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000
               NumFilesPerFileSink: 2
-              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000/
+              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000/
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -115,7 +115,7 @@ STAGE PLANS:
                     serialization.ddl struct bucket2_1 { i32 key, string value}
                     serialization.format 1
                     serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                    transient_lastDdlTime 1297330241
+                    transient_lastDdlTime 1303259761
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                   name: default.bucket2_1
               TotalFiles: 2
@@ -126,7 +126,7 @@ STAGE PLANS:
     Move Operator
       tables:
           replace: true
-          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000
+          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000
           table:
               input format: org.apache.hadoop.mapred.TextInputFormat
               output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,14 +142,14 @@ STAGE PLANS:
                 serialization.ddl struct bucket2_1 { i32 key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297330241
+                transient_lastDdlTime 1303259761
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.bucket2_1
-          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10001
+          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10001
 
   Stage: Stage-2
     Stats-Aggr Operator
-      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000/
+      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000/
 
 
 PREHOOK: query: insert overwrite table bucket2_1
@@ -173,7 +173,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Lineage: bucket2_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket2_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 ABSTRACT SYNTAX TREE:
-  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket2_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket2_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -229,11 +229,11 @@ STAGE PLANS:
 PREHOOK: query: select * from bucket2_1 tablesample (bucket 1 out of 2) s order by key
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucket2_1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-46_160_2130891703682723511/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-06_239_4661085987132263426/-mr-10000
 POSTHOOK: query: select * from bucket2_1 tablesample (bucket 1 out of 2) s order by key
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket2_1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-46_160_2130891703682723511/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-06_239_4661085987132263426/-mr-10000
 POSTHOOK: Lineage: bucket2_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket2_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 0	val_0

Modified: hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
               serialization.ddl struct src { string key, string value}
               serialization.format 1
               serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-              transient_lastDdlTime 1297328964
+              transient_lastDdlTime 1303258260
             serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
           
               input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
                 serialization.ddl struct src { string key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297328964
+                transient_lastDdlTime 1303258260
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.src
             name: default.src
@@ -97,10 +97,10 @@ STAGE PLANS:
             File Output Operator
               compressed: false
               GlobalTableId: 1
-              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000
+              directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000
               NumFilesPerFileSink: 2
               Static Partition Specification: ds=1/
-              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000/
+              Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000/
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -117,7 +117,7 @@ STAGE PLANS:
                     serialization.ddl struct bucket3_1 { i32 key, string value}
                     serialization.format 1
                     serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                    transient_lastDdlTime 1297330250
+                    transient_lastDdlTime 1303259769
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                   name: default.bucket3_1
               TotalFiles: 2
@@ -130,7 +130,7 @@ STAGE PLANS:
           partition:
             ds 1
           replace: true
-          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000
+          source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000
           table:
               input format: org.apache.hadoop.mapred.TextInputFormat
               output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -147,14 +147,14 @@ STAGE PLANS:
                 serialization.ddl struct bucket3_1 { i32 key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297330250
+                transient_lastDdlTime 1303259769
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.bucket3_1
-          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10001
+          tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10001
 
   Stage: Stage-2
     Stats-Aggr Operator
-      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000/
+      Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000/
 
 
 PREHOOK: query: insert overwrite table bucket3_1 partition (ds='1')
@@ -194,7 +194,7 @@ POSTHOOK: Lineage: bucket3_1 PARTITION(d
 POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 ABSTRACT SYNTAX TREE:
-  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket3_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket3_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -258,11 +258,11 @@ STAGE PLANS:
 PREHOOK: query: select * from bucket3_1 tablesample (bucket 1 out of 2) s where ds = '1' order by key
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucket3_1@ds=1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-31-00_746_8025378428702145900/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-18_942_3752538522863517107/-mr-10000
 POSTHOOK: query: select * from bucket3_1 tablesample (bucket 1 out of 2) s where ds = '1' order by key
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket3_1@ds=1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-31-00_746_8025378428702145900/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-18_942_3752538522863517107/-mr-10000
 POSTHOOK: Lineage: bucket3_1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket3_1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]

Modified: hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out Fri Apr 29 22:51:44 2011
@@ -50,9 +50,9 @@ STAGE PLANS:
                       type: string
       Needs Tagging: false
       Path -> Alias:
-        hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src [src]
+        hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src [src]
       Path -> Partition:
-        hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src 
+        hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src 
           Partition
             base file name: src
             input format: org.apache.hadoop.mapred.TextInputFormat
@@ -63,12 +63,12 @@ STAGE PLANS:
               columns.types string:string
               file.inputformat org.apache.hadoop.mapred.TextInputFormat
               file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-              location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src
+              location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src
               name default.src
               serialization.ddl struct src { string key, string value}
               serialization.format 1
               serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-              transient_lastDdlTime 1297926642
+              transient_lastDdlTime 1304060620
             serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
           
               input format: org.apache.hadoop.mapred.TextInputFormat
@@ -79,12 +79,12 @@ STAGE PLANS:
                 columns.types string:string
                 file.inputformat org.apache.hadoop.mapred.TextInputFormat
                 file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-                location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src
+                location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src
                 name default.src
                 serialization.ddl struct src { string key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297926642
+                transient_lastDdlTime 1304060620
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.src
             name: default.src
@@ -100,9 +100,9 @@ STAGE PLANS:
             File Output Operator
               compressed: false
               GlobalTableId: 1
-              directory: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000
+              directory: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000
               NumFilesPerFileSink: 2
-              Stats Publishing Key Prefix: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000/
+              Stats Publishing Key Prefix: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000/
               table:
                   input format: org.apache.hadoop.mapred.TextInputFormat
                   output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -114,12 +114,12 @@ STAGE PLANS:
                     columns.types int:string
                     file.inputformat org.apache.hadoop.mapred.TextInputFormat
                     file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-                    location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/bucket4_1
+                    location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/bucket4_1
                     name default.bucket4_1
                     serialization.ddl struct bucket4_1 { i32 key, string value}
                     serialization.format 1
                     serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                    transient_lastDdlTime 1297926644
+                    transient_lastDdlTime 1304060621
                   serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                   name: default.bucket4_1
               TotalFiles: 2
@@ -130,7 +130,7 @@ STAGE PLANS:
     Move Operator
       tables:
           replace: true
-          source: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000
+          source: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000
           table:
               input format: org.apache.hadoop.mapred.TextInputFormat
               output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,19 +142,19 @@ STAGE PLANS:
                 columns.types int:string
                 file.inputformat org.apache.hadoop.mapred.TextInputFormat
                 file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
-                location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/bucket4_1
+                location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/bucket4_1
                 name default.bucket4_1
                 serialization.ddl struct bucket4_1 { i32 key, string value}
                 serialization.format 1
                 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
-                transient_lastDdlTime 1297926644
+                transient_lastDdlTime 1304060621
               serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
               name: default.bucket4_1
-          tmp directory: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10001
+          tmp directory: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10001
 
   Stage: Stage-2
     Stats-Aggr Operator
-      Stats Aggregation Key Prefix: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000/
+      Stats Aggregation Key Prefix: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000/
 
 
 PREHOOK: query: insert overwrite table bucket4_1
@@ -178,7 +178,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Lineage: bucket4_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket4_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 ABSTRACT SYNTAX TREE:
-  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket4_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket4_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
 
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
@@ -221,11 +221,11 @@ STAGE PLANS:
 PREHOOK: query: select * from bucket4_1 tablesample (bucket 1 out of 2) s
 PREHOOK: type: QUERY
 PREHOOK: Input: default@bucket4_1
-PREHOOK: Output: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-11-09_440_4231562110595025861/-mr-10000
+PREHOOK: Output: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-04-06_999_3814228385258280857/-mr-10000
 POSTHOOK: query: select * from bucket4_1 tablesample (bucket 1 out of 2) s
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@bucket4_1
-POSTHOOK: Output: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-11-09_440_4231562110595025861/-mr-10000
+POSTHOOK: Output: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-04-06_999_3814228385258280857/-mr-10000
 POSTHOOK: Lineage: bucket4_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
 POSTHOOK: Lineage: bucket4_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
 0	val_0