You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2011/04/30 00:51:47 UTC
svn commit: r1097980 [1/11] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/ conf/
ql/src/java/org/apache/hadoop/hive/ql/io/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/
ql/src/java/org/apache/hadoop/hive/ql/parse/ ql/src/java/org/apa...
Author: namit
Date: Fri Apr 29 22:51:44 2011
New Revision: 1097980
URL: http://svn.apache.org/viewvc?rev=1097980&view=rev
Log:
HIVE-2121 Input Sampling By Splits
(Siying Dong via namit)
Added:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java
hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q
hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q
hive/trunk/ql/src/test/queries/clientpositive/split_sample.q
hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out
hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out
hive/trunk/ql/src/test/results/clientpositive/split_sample.q.out
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/conf/hive-default.xml
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin1.q.out
hive/trunk/ql/src/test/results/clientpositive/disable_merge_for_bucketing.q.out
hive/trunk/ql/src/test/results/clientpositive/sample1.q.out
hive/trunk/ql/src/test/results/clientpositive/sample10.q.out
hive/trunk/ql/src/test/results/clientpositive/sample2.q.out
hive/trunk/ql/src/test/results/clientpositive/sample3.q.out
hive/trunk/ql/src/test/results/clientpositive/sample4.q.out
hive/trunk/ql/src/test/results/clientpositive/sample5.q.out
hive/trunk/ql/src/test/results/clientpositive/sample6.q.out
hive/trunk/ql/src/test/results/clientpositive/sample7.q.out
hive/trunk/ql/src/test/results/clientpositive/sample8.q.out
hive/trunk/ql/src/test/results/clientpositive/sample9.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample1.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample2.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample3.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample4.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample5.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample6.q.out
hive/trunk/ql/src/test/results/compiler/parse/sample7.q.out
hive/trunk/ql/src/test/results/compiler/plan/case_sensitivity.q.xml
hive/trunk/ql/src/test/results/compiler/plan/cast1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby4.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby5.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby6.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input20.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input3.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input4.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input5.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input6.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input7.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input8.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input9.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input_part1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input_testsequencefile.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input_testxpath.q.xml
hive/trunk/ql/src/test/results/compiler/plan/input_testxpath2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join3.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join4.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join5.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join6.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join7.q.xml
hive/trunk/ql/src/test/results/compiler/plan/join8.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample3.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample4.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample5.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample6.q.xml
hive/trunk/ql/src/test/results/compiler/plan/sample7.q.xml
hive/trunk/ql/src/test/results/compiler/plan/subq.q.xml
hive/trunk/ql/src/test/results/compiler/plan/udf1.q.xml
hive/trunk/ql/src/test/results/compiler/plan/udf4.q.xml
hive/trunk/ql/src/test/results/compiler/plan/udf6.q.xml
hive/trunk/ql/src/test/results/compiler/plan/udf_case.q.xml
hive/trunk/ql/src/test/results/compiler/plan/udf_when.q.xml
hive/trunk/ql/src/test/results/compiler/plan/union.q.xml
hive/trunk/shims/src/0.20/java/org/apache/hadoop/hive/shims/Hadoop20Shims.java
hive/trunk/shims/src/0.20S/java/org/apache/hadoop/hive/shims/Hadoop20SShims.java
hive/trunk/shims/src/common/java/org/apache/hadoop/hive/shims/HadoopShims.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri Apr 29 22:51:44 2011
@@ -303,6 +303,10 @@ public class HiveConf extends Configurat
//small table file size
HIVESMALLTABLESFILESIZE("hive.smalltable.filesize",25000000L), //25M
+
+ // random number for split sampling
+ HIVESAMPLERANDOMNUM("hive.sample.seednumber", 0),
+
// test mode in hive mode
HIVETESTMODE("hive.test.mode", false),
HIVETESTMODEPREFIX("hive.test.mode.prefix", "test_"),
Modified: hive/trunk/conf/hive-default.xml
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml (original)
+++ hive/trunk/conf/hive-default.xml Fri Apr 29 22:51:44 2011
@@ -1067,4 +1067,11 @@
rebuild work. This is very helpful for tables with thousands of partitions.</description>
</property>
+<property>
+ <name>hive.sample.seednumber</name>
+ <value>0</value>
+ <description>A number used for percentage sampling. By changing this number, the user will change the subsets
+ of data sampled.</description>
+</property>
+
</configuration>
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java Fri Apr 29 22:51:44 2011
@@ -24,10 +24,10 @@ import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
-import java.util.HashMap;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
@@ -38,12 +38,14 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim;
import org.apache.hadoop.hive.shims.HadoopShims.InputSplitShim;
-import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
@@ -53,7 +55,6 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.hive.ql.exec.Operator;
/**
* CombineHiveInputFormat is a parameterized InputFormat which looks at the path
@@ -211,13 +212,18 @@ public class CombineHiveInputFormat<K ex
out.writeUTF(inputFormatClassName);
}
+
+ @Override
+ public void shrinkSplit(long length) {
+ inputSplitShim.shrinkSplit(length);
+ }
}
// Splits are not shared across different partitions with different input formats.
// For example, 2 partitions (1 sequencefile and 1 rcfile) will have 2 different splits
private static class CombinePathInputFormat {
- private List<Operator<? extends Serializable>> opList;
- private String inputFormatClassName;
+ private final List<Operator<? extends Serializable>> opList;
+ private final String inputFormatClassName;
public CombinePathInputFormat(List<Operator<? extends Serializable>> opList,
String inputFormatClassName) {
@@ -225,6 +231,7 @@ public class CombineHiveInputFormat<K ex
this.inputFormatClassName = inputFormatClassName;
}
+ @Override
public boolean equals(Object o) {
if (o instanceof CombinePathInputFormat) {
CombinePathInputFormat mObj = (CombinePathInputFormat)o;
@@ -248,7 +255,6 @@ public class CombineHiveInputFormat<K ex
*/
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
-
init(job);
Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<String, Operator<? extends Serializable>> aliasToWork =
@@ -344,7 +350,7 @@ public class CombineHiveInputFormat<K ex
boolean done = false;
if (!mrwork.isMapperCannotSpanPartns()) {
- opList = HiveFileFormatUtils.doGetAliasesFromPath(
+ opList = HiveFileFormatUtils.doGetWorksFromPath(
pathToAliases, aliasToWork, filterPath);
f = poolMap.get(new CombinePathInputFormat(opList, inputFormatClassName));
}
@@ -375,6 +381,11 @@ public class CombineHiveInputFormat<K ex
}
InputSplitShim[] iss = combine.getSplits(job, 1);
+
+ if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
+ iss = sampleSplits(iss);
+ }
+
for (InputSplitShim is : iss) {
CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is);
result.add(csplit);
@@ -385,6 +396,88 @@ public class CombineHiveInputFormat<K ex
}
/**
+ * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
+ *
+ * First, splits are grouped by the alias they are for. If one split serves more than one
+ * alias, or does not serve any sampled alias, we just directly add it to the returned list.
+ * Then we find a list of exclusive splits for every alias to be sampled.
+ * For each alias, we start from the position seedNumber%totalNumber, and keep adding
+ * splits until the total size hits the sampling percentage.
+ * @param splits
+ * @return the sampled splits
+ */
+ private InputSplitShim[] sampleSplits(InputSplitShim[] splits) {
+ HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
+ List<InputSplitShim> retLists = new ArrayList<InputSplitShim>();
+ Map<String, ArrayList<InputSplitShim>> aliasToSplitList = new HashMap<String, ArrayList<InputSplitShim>>();
+ Map<String, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
+
+ // Populate list of exclusive splits for every sampled alias
+ //
+ for (InputSplitShim split : splits) {
+ String alias = null;
+ for (Path path : split.getPaths()) {
+ List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
+ pathToAliases, path);
+ // a path for a split disqualifies the split from being sampled if:
+ // 1. it serves more than one alias
+ // 2. the alias it serves is not sampled
+ // 3. it serves a different alias than another path for the same split
+ if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) ||
+ (alias != null && l.get(0) != alias)) {
+ alias = null;
+ break;
+ }
+ alias = l.get(0);
+ }
+
+ if (alias != null) {
+ // split exclusively serves alias, which needs to be sampled
+ // add it to the split list of the alias.
+ if (!aliasToSplitList.containsKey(alias)) {
+ aliasToSplitList.put(alias, new ArrayList<InputSplitShim>());
+ }
+ aliasToSplitList.get(alias).add(split);
+ } else {
+ // The split doesn't exclusively serve one alias
+ retLists.add(split);
+ }
+ }
+
+ // for every sampled alias, we figure out splits to be sampled and add
+ // them to return list
+ //
+ for (Map.Entry<String, ArrayList<InputSplitShim>> entry: aliasToSplitList.entrySet()) {
+ ArrayList<InputSplitShim> splitList = entry.getValue();
+ long totalSize = 0;
+ for (InputSplitShim split : splitList) {
+ totalSize += split.getLength();
+ }
+
+ long targetSize = (long) (totalSize * nameToSamples.get(entry.getKey()).getPercent() / 100D);
+ int startIndex = nameToSamples.get(entry.getKey()).getSeedNum() % splitList.size();
+ int size = 0;
+ for (int i = 0; i < splitList.size(); i++) {
+ InputSplitShim split = splitList.get((startIndex + i) % splitList.size());
+ retLists.add(split);
+ long splitgLength = split.getLength();
+ if (size + splitgLength >= targetSize) {
+ LOG.info("Sample alias " + entry.getValue() + " using " + (i + 1) + "splits");
+ if (size + splitgLength > targetSize) {
+ split.shrinkSplit(targetSize - size);
+ }
+ break;
+ }
+ size += splitgLength;
+ }
+
+ }
+
+ InputSplitShim[] retArray = new InputSplitShim[retLists.size()];
+ return retLists.toArray(retArray);
+ }
+
+ /**
* Create a generic Hive RecordReader than can iterate over all chunks in a
* CombinedFileSplit.
*/
@@ -417,7 +510,7 @@ public class CombineHiveInputFormat<K ex
}
static class CombineFilter implements PathFilter {
- private List<String> pStrings = new ArrayList<String>();
+ private final List<String> pStrings = new ArrayList<String>();
// store a path prefix in this TestFilter
// PRECONDITION: p should always be a directory
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java Fri Apr 29 22:51:44 2011
@@ -22,8 +22,8 @@ import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
-import java.util.List;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@@ -32,15 +32,15 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
-import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
@@ -384,22 +384,34 @@ public final class HiveFileFormatUtils {
* @param aliasToWork The operator tree to be invoked for a given alias
* @param dir The path to look for
**/
- public static List<Operator<? extends Serializable>> doGetAliasesFromPath(
+ public static List<Operator<? extends Serializable>> doGetWorksFromPath(
Map<String, ArrayList<String>> pathToAliases,
Map<String, Operator<? extends Serializable>> aliasToWork, Path dir) {
List<Operator<? extends Serializable>> opList =
new ArrayList<Operator<? extends Serializable>>();
- if (pathToAliases == null) {
- return opList;
- }
- String path = getMatchingPath(pathToAliases, dir);
- List<String> aliases = pathToAliases.get(path);
+
+ List<String> aliases = doGetAliasesFromPath(pathToAliases, dir);
for (String alias : aliases) {
opList.add(aliasToWork.get(alias));
}
return opList;
}
+ /**
+ * Get the list of aliases from the operator tree that are needed for the path
+ * @param pathToAliases mapping from path to aliases
+ * @param dir The path to look for
+ **/
+ public static List<String> doGetAliasesFromPath(
+ Map<String, ArrayList<String>> pathToAliases,
+ Path dir) {
+ if (pathToAliases == null) {
+ return new ArrayList<String>();
+ }
+ String path = getMatchingPath(pathToAliases, dir);
+ return pathToAliases.get(path);
+ }
+
private HiveFileFormatUtils() {
// prevent instantiation
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java Fri Apr 29 22:51:44 2011
@@ -55,7 +55,6 @@ import org.apache.hadoop.hive.ql.parse.S
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles;
-import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
@@ -72,6 +71,7 @@ import org.apache.hadoop.hive.ql.plan.Re
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
/**
@@ -425,7 +425,7 @@ public class GenMRFileSink1 implements N
aliases.add(inputDir); // dummy alias: just use the input path
// constructing the default MapredWork
- MapredWork cplan = GenMapRedUtils.getMapRedWork(conf);
+ MapredWork cplan = GenMapRedUtils.getMapRedWorkFromConf(conf);
cplan.getPathToAliases().put(inputDir, aliases);
cplan.getPathToPartitionInfo().put(inputDir, new PartitionDesc(tblDesc, null));
cplan.setNumReduceTasks(0);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRTableScan1.java Fri Apr 29 22:51:44 2011
@@ -33,11 +33,11 @@ import org.apache.hadoop.hive.ql.lib.Nod
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
-import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.QBParseInfo;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
/**
@@ -63,7 +63,7 @@ public class GenMRTableScan1 implements
Map<Operator<? extends Serializable>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
// create a dummy MapReduce task
- MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+ MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
Task<? extends Serializable> currTask = TaskFactory.get(currWork, parseCtx.getConf());
Operator<? extends Serializable> currTopOp = op;
ctx.setCurrTask(currTask);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java Fri Apr 29 22:51:44 2011
@@ -28,7 +28,6 @@ import java.util.Stack;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
-import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.Task;
@@ -132,7 +131,7 @@ public class GenMRUnion1 implements Node
// union is encountered for the first time
if (uCtxTask == null) {
uCtxTask = new GenMRUnionCtx();
- uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+ uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
uTask = TaskFactory.get(uPlan, parseCtx.getConf());
uCtxTask.setUTask(uTask);
ctx.setUnionTask(union, uCtxTask);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java Fri Apr 29 22:51:44 2011
@@ -60,16 +60,16 @@ import org.apache.hadoop.hive.ql.parse.R
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
-import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
-import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
+import org.apache.hadoop.hive.ql.plan.MapredLocalWork.BucketMapJoinContext;
/**
* General utility common functions for the Processor to convert operator into
@@ -474,7 +474,7 @@ public final class GenMapRedUtils {
throws SemanticException {
// Generate a new task
ParseContext parseCtx = opProcCtx.getParseCtx();
- MapredWork cplan = getMapRedWork(parseCtx.getConf());
+ MapredWork cplan = getMapRedWork(parseCtx);
Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx
.getConf());
Operator<? extends Serializable> reducer = op.getChildOperators().get(0);
@@ -544,6 +544,8 @@ public final class GenMapRedUtils {
PrunedPartitionList partsList = pList;
+ plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
+
if (partsList == null) {
try {
partsList = parseCtx.getOpToPartList().get((TableScanOperator)topOp);
@@ -836,9 +838,21 @@ public final class GenMapRedUtils {
*
* @return the new plan
*/
- public static MapredWork getMapRedWork(HiveConf conf) {
+ public static MapredWork getMapRedWork(ParseContext parseCtx) {
+ MapredWork work = getMapRedWorkFromConf(parseCtx.getConf());
+ work.setNameToSplitSample(parseCtx.getNameToSplitSample());
+ return work;
+ }
+
+ /**
+ * Create a new plan and return it. The plan won't contain the name to split
+ * sample information from the parse context.
+ *
+ * @return the new plan
+ */
+ public static MapredWork getMapRedWorkFromConf(HiveConf conf) {
MapredWork work = new MapredWork();
- // This code has been only added for testing
+
boolean mapperCannotSpanPartns =
conf.getBoolVar(
HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
@@ -1020,7 +1034,7 @@ public final class GenMapRedUtils {
// union is encountered for the first time
if (uCtxTask == null) {
uCtxTask = new GenMRUnionCtx();
- uPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+ uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
uTask = TaskFactory.get(uPlan, parseCtx.getConf());
uCtxTask.setUTask(uTask);
ctx.setUnionTask(union, uCtxTask);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinFactory.java Fri Apr 29 22:51:44 2011
@@ -127,7 +127,7 @@ public final class MapJoinFactory {
GenMRProcContext opProcCtx = (GenMRProcContext) procCtx;
ParseContext parseCtx = opProcCtx.getParseCtx();
- MapredWork cplan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+ MapredWork cplan = GenMapRedUtils.getMapRedWork(parseCtx);
Task<? extends Serializable> redTask = TaskFactory.get(cplan, parseCtx
.getConf());
Task<? extends Serializable> currTask = opProcCtx.getCurrTask();
@@ -211,7 +211,7 @@ public final class MapJoinFactory {
ctx.setMapJoinCtx(mapJoin, mjCtx);
}
- MapredWork mjPlan = GenMapRedUtils.getMapRedWork(parseCtx.getConf());
+ MapredWork mjPlan = GenMapRedUtils.getMapRedWork(parseCtx);
Task<? extends Serializable> mjTask = TaskFactory.get(mjPlan, parseCtx
.getConf());
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g Fri Apr 29 22:51:44 2011
@@ -173,7 +173,8 @@ TOK_ALTERTABLE_CLUSTER_SORT;
TOK_TABCOLNAME;
TOK_TABLELOCATION;
TOK_PARTITIONLOCATION;
-TOK_TABLESAMPLE;
+TOK_TABLEBUCKETSAMPLE;
+TOK_TABLESPLITSAMPLE;
TOK_TMP_FILE;
TOK_TABSORTCOLNAMEASC;
TOK_TABSORTCOLNAMEDESC;
@@ -1620,12 +1621,27 @@ fromSource
(tableSource | subQuerySource) (lateralView^)*
;
+tableBucketSample
+@init { msgs.push("table bucket sample specification"); }
+@after { msgs.pop(); }
+ :
+ KW_TABLESAMPLE LPAREN KW_BUCKET (numerator=Number) KW_OUT KW_OF (denominator=Number) (KW_ON expr+=expression (COMMA expr+=expression)*)? RPAREN -> ^(TOK_TABLEBUCKETSAMPLE $numerator $denominator $expr*)
+ ;
+
+splitSample
+@init { msgs.push("table split sample specification"); }
+@after { msgs.pop(); }
+ :
+ KW_TABLESAMPLE LPAREN (numerator=Number) KW_PERCENT RPAREN -> ^(TOK_TABLESPLITSAMPLE $numerator)
+ ;
+
tableSample
@init { msgs.push("table sample specification"); }
@after { msgs.pop(); }
:
- KW_TABLESAMPLE LPAREN KW_BUCKET (numerator=Number) KW_OUT KW_OF (denominator=Number) (KW_ON expr+=expression (COMMA expr+=expression)*)? RPAREN -> ^(TOK_TABLESAMPLE $numerator $denominator $expr*)
- ;
+ tableBucketSample |
+ splitSample
+ ;
tableSource
@init { msgs.push("table source"); }
@@ -2172,6 +2188,7 @@ KW_TABLESAMPLE: 'TABLESAMPLE';
KW_BUCKET: 'BUCKET';
KW_OUT: 'OUT';
KW_OF: 'OF';
+KW_PERCENT: 'PERCENT';
KW_CAST: 'CAST';
KW_ADD: 'ADD';
KW_REPLACE: 'REPLACE';
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java Fri Apr 29 22:51:44 2011
@@ -64,6 +64,7 @@ public class ParseContext {
private Map<JoinOperator, QBJoinTree> joinContext;
private Map<MapJoinOperator, QBJoinTree> mapJoinContext;
private HashMap<TableScanOperator, Table> topToTable;
+ private HashMap<String, SplitSample> nameToSplitSample;
private List<LoadTableDesc> loadTableWork;
private List<LoadFileDesc> loadFileWork;
private Context ctx;
@@ -146,7 +147,8 @@ public class ParseContext {
Map<GroupByOperator, Set<String>> groupOpToInputTables,
Map<String, PrunedPartitionList> prunedPartitions,
HashMap<TableScanOperator, sampleDesc> opToSamplePruner,
- SemanticAnalyzer.GlobalLimitCtx globalLimitCtx) {
+ SemanticAnalyzer.GlobalLimitCtx globalLimitCtx,
+ HashMap<String, SplitSample> nameToSplitSample) {
this.conf = conf;
this.qb = qb;
this.ast = ast;
@@ -169,6 +171,7 @@ public class ParseContext {
this.groupOpToInputTables = groupOpToInputTables;
this.prunedPartitions = prunedPartitions;
this.opToSamplePruner = opToSamplePruner;
+ this.nameToSplitSample = nameToSplitSample;
this.globalLimitCtx = globalLimitCtx;
}
@@ -318,6 +321,14 @@ public class ParseContext {
this.opParseCtx = opParseCtx;
}
+ public HashMap<String, SplitSample> getNameToSplitSample() {
+ return nameToSplitSample;
+ }
+
+ public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
+ this.nameToSplitSample = nameToSplitSample;
+ }
+
/**
* @return the loadTableWork
*/
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Fri Apr 29 22:51:44 2011
@@ -70,6 +70,7 @@ import org.apache.hadoop.hive.ql.exec.Un
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
+import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
@@ -187,6 +188,11 @@ public class SemanticAnalyzer extends Ba
private UnionProcContext uCtx;
List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer;
private HashMap<TableScanOperator, sampleDesc> opToSamplePruner;
+ /**
+ * a map for the split sampling, from alias to an instance of SplitSample
+ * that describes the sampling percentage and seed number.
+ */
+ private final HashMap<String, SplitSample> nameToSplitSample;
Map<GroupByOperator, Set<String>> groupOpToInputTables;
Map<String, PrunedPartitionList> prunedPartitions;
private List<FieldSchema> resultSchema;
@@ -249,6 +255,7 @@ public class SemanticAnalyzer extends Ba
opToPartPruner = new HashMap<TableScanOperator, ExprNodeDesc>();
opToPartList = new HashMap<TableScanOperator, PrunedPartitionList>();
opToSamplePruner = new HashMap<TableScanOperator, sampleDesc>();
+ nameToSplitSample = new HashMap<String, SplitSample>();
topOps = new HashMap<String, Operator<? extends Serializable>>();
topSelOps = new HashMap<String, Operator<? extends Serializable>>();
loadTableWork = new ArrayList<LoadTableDesc>();
@@ -308,7 +315,7 @@ public class SemanticAnalyzer extends Ba
topSelOps, opParseCtx, joinContext, topToTable, loadTableWork,
loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
- opToSamplePruner, globalLimitCtx);
+ opToSamplePruner, globalLimitCtx, nameToSplitSample);
}
@SuppressWarnings("nls")
@@ -428,21 +435,30 @@ public class SemanticAnalyzer extends Ba
// and the alias (if alias is not present, the table name
// is used as an alias)
boolean tableSamplePresent = false;
+ boolean splitSamplePresent = false;
+
int aliasIndex = 0;
if (tabref.getChildCount() == 2) {
// tablename tablesample
// OR
// tablename alias
ASTNode ct = (ASTNode) tabref.getChild(1);
- if (ct.getToken().getType() == HiveParser.TOK_TABLESAMPLE) {
+ if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
tableSamplePresent = true;
+ } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+ splitSamplePresent = true;
} else {
aliasIndex = 1;
}
} else if (tabref.getChildCount() == 3) {
// table name table sample alias
aliasIndex = 2;
- tableSamplePresent = true;
+ ASTNode ct = (ASTNode) tabref.getChild(1);
+ if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) {
+ tableSamplePresent = true;
+ } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) {
+ splitSamplePresent = true;
+ }
}
ASTNode tableTree = (ASTNode) (tabref.getChild(0));
@@ -482,6 +498,23 @@ public class SemanticAnalyzer extends Ba
.getChild(0));
}
}
+ } else if (splitSamplePresent) {
+ // only CombineHiveInputFormat supports this optimize
+ String inputFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEINPUTFORMAT);
+ if (!inputFormat.equals(
+ CombineHiveInputFormat.class.getName())) {
+ throw new SemanticException(
+ "Percentage sampling is not supported in " + inputFormat);
+ }
+ ASTNode sampleClause = (ASTNode) tabref.getChild(1);
+ String alias_id = getAliasId(alias, qb);
+ String strPercentage = unescapeIdentifier(sampleClause.getChild(0).getText());
+ Double percent = Double.valueOf(strPercentage).doubleValue();
+ if (percent < 0 || percent > 100) {
+ throw new SemanticException("Sampling percentage should be between 0 and 100.");
+ }
+ nameToSplitSample.put(alias_id, new SplitSample(
+ percent, conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM)));
}
// Insert this map into the stats
qb.setTabAlias(alias, tabIdName);
@@ -5759,10 +5792,14 @@ public class SemanticAnalyzer extends Ba
return equalsExpr;
}
+ private String getAliasId(String alias, QB qb) {
+ return (qb.getId() == null ? alias : qb.getId() + ":" + alias);
+ }
+
@SuppressWarnings("nls")
private Operator genTablePlan(String alias, QB qb) throws SemanticException {
- String alias_id = (qb.getId() == null ? alias : qb.getId() + ":" + alias);
+ String alias_id = getAliasId(alias, qb);
Table tab = qb.getMetaData().getSrcForAlias(alias);
RowResolver rwsch;
@@ -6356,17 +6393,19 @@ public class SemanticAnalyzer extends Ba
// determine the query qualifies reduce input size for LIMIT
// The query only qualifies when there are only one top operator
- // and there is no transformer or UDTF
+ // and there is no transformer or UDTF and no block sampling
+ // is used.
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVELIMITOPTENABLE)
&& ctx.getTryCount() == 0 && topOps.size() == 1
- && !globalLimitCtx.ifHasTransformOrUDTF()) {
+ && !globalLimitCtx.ifHasTransformOrUDTF() &&
+ nameToSplitSample.isEmpty()) {
// Here we recursively check:
// 1. whether there are exact one LIMIT in the query
// 2. whether there is no aggregation, group-by, distinct, sort by,
// distributed by, or table sampling in any of the sub-query.
// The query only qualifies if both conditions are satisfied.
- //
+ //
// Example qualified queries:
// CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
// INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
@@ -6817,7 +6856,7 @@ public class SemanticAnalyzer extends Ba
opToPartList, topOps, topSelOps, opParseCtx, joinContext, topToTable,
loadTableWork, loadFileWork, ctx, idToTableNameMap, destTableId, uCtx,
listMapJoinOpsNoReducer, groupOpToInputTables, prunedPartitions,
- opToSamplePruner, globalLimitCtx);
+ opToSamplePruner, globalLimitCtx, nameToSplitSample);
Optimizer optm = new Optimizer();
optm.setPctx(pCtx);
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SplitSample.java Fri Apr 29 22:51:44 2011
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.hive.ql.plan.Explain;
+
+
+
+/**
+ *
+ * This class stores all the information specified in the TABLESAMPLE(PERCENT ...) clause.
+ * e.g. for the clause "FROM t TABLESAMPLE(1 PERCENT)" it will store the percentage 1,
+ * and the seed number is used to determine which 1%. Currently it is from the conf
+ * hive.sample.seednumber
+ *
+ */
+public class SplitSample implements Serializable{
+
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * The percentage of the TABLESAMPLE clause.
+ */
+ private double percent;
+
+ /**
+ * The number used to determine which part of the input to sample
+ */
+ private int seedNum = 0;
+
+ public SplitSample() {
+ }
+
+
+ public SplitSample(double percent, int seedNum) {
+ this.percent = percent;
+ this.seedNum = seedNum;
+ }
+
+ @Explain(displayName = "percentage")
+ public double getPercent() {
+ return percent;
+ }
+
+ public void setPercent(double percent) {
+ this.percent = percent;
+ }
+
+ @Explain(displayName = "seed number")
+ public int getSeedNum() {
+ return seedNum;
+ }
+
+ public void setSeedNum(int seedNum) {
+ this.seedNum = seedNum;
+ }
+
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java Fri Apr 29 22:51:44 2011
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.ql.plan;
import java.io.ByteArrayOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -29,6 +30,7 @@ import org.apache.hadoop.hive.ql.exec.Op
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SplitSample;
/**
* MapredWork.
@@ -49,6 +51,8 @@ public class MapredWork implements Seria
private LinkedHashMap<String, PartitionDesc> aliasToPartnInfo;
+ private HashMap<String, SplitSample> nameToSplitSample;
+
// map<->reduce interface
// schema of the map-reduce 'key' object - this is homogeneous
private TableDesc keyDesc;
@@ -201,6 +205,15 @@ public class MapredWork implements Seria
return reducer;
}
+ @Explain(displayName = "Percentage Sample")
+ public HashMap<String, SplitSample> getNameToSplitSample() {
+ return nameToSplitSample;
+ }
+
+ public void setNameToSplitSample(HashMap<String, SplitSample> nameToSplitSample) {
+ this.nameToSplitSample = nameToSplitSample;
+ }
+
public void setReducer(final Operator<?> reducer) {
this.reducer = reducer;
}
Added: hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/split_sample_out_of_range.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,3 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+select key from src tablesample(105 percent);
Added: hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/split_sample_wrong_format.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,3 @@
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+select key from src tablesample(1 percent);
Added: hive/trunk/ql/src/test/queries/clientpositive/split_sample.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/split_sample.q?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/split_sample.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/split_sample.q Fri Apr 29 22:51:44 2011
@@ -0,0 +1,86 @@
+drop table ss_src1;
+drop table ss_src2;
+drop table ss_src3;
+drop table ss_i_part;
+drop table ss_t3;
+drop table ss_t4;
+drop table ss_t5;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set mapred.max.split.size=300;
+set mapred.min.split.size=300;
+set mapred.min.split.size.per.node=300;
+set mapred.min.split.size.per.rack=300;
+set hive.merge.smallfiles.avgsize=1;
+
+-- create multiple file inputs (to enable multiple splits)
+create table ss_i_part (key int, value string) partitioned by (p string);
+insert overwrite table ss_i_part partition (p='1') select key, value from src;
+insert overwrite table ss_i_part partition (p='2') select key, value from src;
+insert overwrite table ss_i_part partition (p='3') select key, value from src;
+create table ss_src2 as select key, value from ss_i_part;
+select count(1) from ss_src2 tablesample(1 percent);
+
+-- sample first split
+desc ss_src2;
+set hive.sample.seednumber=0;
+explain select key, value from ss_src2 tablesample(1 percent) limit 10;
+select key, value from ss_src2 tablesample(1 percent) limit 10;
+
+-- verify seed number of sampling
+insert overwrite table ss_i_part partition (p='1') select key+10000, value from src;
+insert overwrite table ss_i_part partition (p='2') select key+20000, value from src;
+insert overwrite table ss_i_part partition (p='3') select key+30000, value from src;
+create table ss_src3 as select key, value from ss_i_part;
+set hive.sample.seednumber=3;
+create table ss_t3 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+set hive.sample.seednumber=4;
+create table ss_t4 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+set hive.sample.seednumber=5;
+create table ss_t5 as select sum(key) % 397 as s from ss_src3 tablesample(1 percent) limit 10;
+select sum(s) from (select s from ss_t3 union all select s from ss_t4 union all select s from ss_t5) t;
+
+-- sample more than one split
+explain select count(distinct key) from ss_src2 tablesample(70 percent) limit 10;
+select count(distinct key) from ss_src2 tablesample(70 percent) limit 10;
+
+-- sample all splits
+select count(1) from ss_src2 tablesample(100 percent);
+
+-- subquery
+explain select key from (select key from ss_src2 tablesample(1 percent) limit 10) subq;
+select key from (select key from ss_src2 tablesample(1 percent) limit 10) subq;
+
+-- groupby
+select key, count(1) from ss_src2 tablesample(1 percent) group by key order by key;
+
+-- sample one of two tables:
+create table ss_src1 as select * from ss_src2;
+select t2.key as k from ss_src1 join ss_src2 tablesample(1 percent) t2 on ss_src1.key=t2.key order by k;
+
+-- sample two tables
+explain select * from (
+select t1.key as k1, t2.key as k from ss_src1 tablesample(80 percent) t1 full outer join ss_src2 tablesample(2 percent) t2 on t1.key=t2.key
+) subq where k in (199, 10199, 20199) or k1 in (199, 10199, 20199);
+
+select * from (
+select t1.key as k1, t2.key as k from ss_src1 tablesample(80 percent) t1 full outer join ss_src2 tablesample(2 percent) t2 on t1.key=t2.key
+) subq where k in (199, 10199, 20199) or k1 in (199, 10199, 20199);
+
+-- shrink last split
+explain select count(1) from ss_src2 tablesample(1 percent);
+set mapred.max.split.size=300000;
+set mapred.min.split.size=300000;
+set mapred.min.split.size.per.node=300000;
+set mapred.min.split.size.per.rack=300000;
+select count(1) from ss_src2 tablesample(1 percent);
+select count(1) from ss_src2 tablesample(50 percent);
+
+
+drop table ss_src1;
+drop table ss_src2;
+drop table ss_src3;
+drop table ss_i_part;
+drop table ss_t3;
+drop table ss_t4;
+drop table ss_t5;
Added: hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/split_sample_out_of_range.q.out Fri Apr 29 22:51:44 2011
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: Sampling percentage should be between 0 and 100.
Added: hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out?rev=1097980&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/split_sample_wrong_format.q.out Fri Apr 29 22:51:44 2011
@@ -0,0 +1 @@
+FAILED: Error in semantic analysis: Percentage sampling is not supported in org.apache.hadoop.hive.ql.io.HiveInputFormat
Modified: hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.src
name: default.src
@@ -97,9 +97,9 @@ STAGE PLANS:
File Output Operator
compressed: false
GlobalTableId: 1
- directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000
+ directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000
NumFilesPerFileSink: 1
- Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000/
+ Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000/
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -115,7 +115,7 @@ STAGE PLANS:
serialization.ddl struct bucket1_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330232
+ transient_lastDdlTime 1303259753
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket1_1
TotalFiles: 1
@@ -126,7 +126,7 @@ STAGE PLANS:
Move Operator
tables:
replace: true
- source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000
+ source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,14 +142,14 @@ STAGE PLANS:
serialization.ddl struct bucket1_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330232
+ transient_lastDdlTime 1303259753
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket1_1
- tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10001
+ tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10001
Stage: Stage-2
Stats-Aggr Operator
- Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-32_715_8063819901877770761/-ext-10000/
+ Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-35-53_792_7618040586201295812/-ext-10000/
PREHOOK: query: insert overwrite table bucket1_1
@@ -167,11 +167,11 @@ POSTHOOK: Lineage: bucket1_1.value SIMPL
PREHOOK: query: select * from bucket1_1 order by key
PREHOOK: type: QUERY
PREHOOK: Input: default@bucket1_1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-37_416_6808991169590499902/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-35-58_044_385144993656634200/-mr-10000
POSTHOOK: query: select * from bucket1_1 order by key
POSTHOOK: type: QUERY
POSTHOOK: Input: default@bucket1_1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-37_416_6808991169590499902/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-35-58_044_385144993656634200/-mr-10000
POSTHOOK: Lineage: bucket1_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket1_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
0 val_0
Modified: hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.src
name: default.src
@@ -97,9 +97,9 @@ STAGE PLANS:
File Output Operator
compressed: false
GlobalTableId: 1
- directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000
+ directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000
NumFilesPerFileSink: 2
- Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000/
+ Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000/
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -115,7 +115,7 @@ STAGE PLANS:
serialization.ddl struct bucket2_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330241
+ transient_lastDdlTime 1303259761
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket2_1
TotalFiles: 2
@@ -126,7 +126,7 @@ STAGE PLANS:
Move Operator
tables:
replace: true
- source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000
+ source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,14 +142,14 @@ STAGE PLANS:
serialization.ddl struct bucket2_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330241
+ transient_lastDdlTime 1303259761
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket2_1
- tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10001
+ tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10001
Stage: Stage-2
Stats-Aggr Operator
- Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-41_276_6827868533448324012/-ext-10000/
+ Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-01_663_7003431130306835352/-ext-10000/
PREHOOK: query: insert overwrite table bucket2_1
@@ -173,7 +173,7 @@ POSTHOOK: type: QUERY
POSTHOOK: Lineage: bucket2_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket2_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
ABSTRACT SYNTAX TREE:
- (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket2_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket2_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
STAGE DEPENDENCIES:
Stage-1 is a root stage
@@ -229,11 +229,11 @@ STAGE PLANS:
PREHOOK: query: select * from bucket2_1 tablesample (bucket 1 out of 2) s order by key
PREHOOK: type: QUERY
PREHOOK: Input: default@bucket2_1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-46_160_2130891703682723511/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-06_239_4661085987132263426/-mr-10000
POSTHOOK: query: select * from bucket2_1 tablesample (bucket 1 out of 2) s order by key
POSTHOOK: type: QUERY
POSTHOOK: Input: default@bucket2_1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-30-46_160_2130891703682723511/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-06_239_4661085987132263426/-mr-10000
POSTHOOK: Lineage: bucket2_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket2_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
0 val_0
Modified: hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out Fri Apr 29 22:51:44 2011
@@ -65,7 +65,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -81,7 +81,7 @@ STAGE PLANS:
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297328964
+ transient_lastDdlTime 1303258260
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.src
name: default.src
@@ -97,10 +97,10 @@ STAGE PLANS:
File Output Operator
compressed: false
GlobalTableId: 1
- directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000
+ directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000
NumFilesPerFileSink: 2
Static Partition Specification: ds=1/
- Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000/
+ Stats Publishing Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000/
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -117,7 +117,7 @@ STAGE PLANS:
serialization.ddl struct bucket3_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330250
+ transient_lastDdlTime 1303259769
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket3_1
TotalFiles: 2
@@ -130,7 +130,7 @@ STAGE PLANS:
partition:
ds 1
replace: true
- source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000
+ source: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -147,14 +147,14 @@ STAGE PLANS:
serialization.ddl struct bucket3_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297330250
+ transient_lastDdlTime 1303259769
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket3_1
- tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10001
+ tmp directory: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10001
Stage: Stage-2
Stats-Aggr Operator
- Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-02-10_01-30-50_887_7521360143673656649/-ext-10000/
+ Stats Aggregation Key Prefix: pfile:/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-19_17-36-09_607_3764667041010338952/-ext-10000/
PREHOOK: query: insert overwrite table bucket3_1 partition (ds='1')
@@ -194,7 +194,7 @@ POSTHOOK: Lineage: bucket3_1 PARTITION(d
POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
ABSTRACT SYNTAX TREE:
- (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket3_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket3_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL ds) '1')) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL key)))))
STAGE DEPENDENCIES:
Stage-1 is a root stage
@@ -258,11 +258,11 @@ STAGE PLANS:
PREHOOK: query: select * from bucket3_1 tablesample (bucket 1 out of 2) s where ds = '1' order by key
PREHOOK: type: QUERY
PREHOOK: Input: default@bucket3_1@ds=1
-PREHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-31-00_746_8025378428702145900/-mr-10000
+PREHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-18_942_3752538522863517107/-mr-10000
POSTHOOK: query: select * from bucket3_1 tablesample (bucket 1 out of 2) s where ds = '1' order by key
POSTHOOK: type: QUERY
POSTHOOK: Input: default@bucket3_1@ds=1
-POSTHOOK: Output: file:/tmp/sdong/hive_2011-02-10_01-31-00_746_8025378428702145900/-mr-10000
+POSTHOOK: Output: file:/tmp/sdong/hive_2011-04-19_17-36-18_942_3752538522863517107/-mr-10000
POSTHOOK: Lineage: bucket3_1 PARTITION(ds=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket3_1 PARTITION(ds=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: bucket3_1 PARTITION(ds=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
Modified: hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out?rev=1097980&r1=1097979&r2=1097980&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out Fri Apr 29 22:51:44 2011
@@ -50,9 +50,9 @@ STAGE PLANS:
type: string
Needs Tagging: false
Path -> Alias:
- hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src [src]
+ hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src [src]
Path -> Partition:
- hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src
+ hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src
Partition
base file name: src
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -63,12 +63,12 @@ STAGE PLANS:
columns.types string:string
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src
+ location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src
name default.src
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297926642
+ transient_lastDdlTime 1304060620
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
@@ -79,12 +79,12 @@ STAGE PLANS:
columns.types string:string
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/src
+ location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/src
name default.src
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297926642
+ transient_lastDdlTime 1304060620
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.src
name: default.src
@@ -100,9 +100,9 @@ STAGE PLANS:
File Output Operator
compressed: false
GlobalTableId: 1
- directory: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000
+ directory: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000
NumFilesPerFileSink: 2
- Stats Publishing Key Prefix: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000/
+ Stats Publishing Key Prefix: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000/
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -114,12 +114,12 @@ STAGE PLANS:
columns.types int:string
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/bucket4_1
+ location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/bucket4_1
name default.bucket4_1
serialization.ddl struct bucket4_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297926644
+ transient_lastDdlTime 1304060621
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket4_1
TotalFiles: 2
@@ -130,7 +130,7 @@ STAGE PLANS:
Move Operator
tables:
replace: true
- source: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000
+ source: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
@@ -142,19 +142,19 @@ STAGE PLANS:
columns.types int:string
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- location hdfs://localhost.localdomain:38821/build/ql/test/data/warehouse/bucket4_1
+ location hdfs://localhost.localdomain:54445/build/ql/test/data/warehouse/bucket4_1
name default.bucket4_1
serialization.ddl struct bucket4_1 { i32 key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- transient_lastDdlTime 1297926644
+ transient_lastDdlTime 1304060621
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.bucket4_1
- tmp directory: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10001
+ tmp directory: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10001
Stage: Stage-2
Stats-Aggr Operator
- Stats Aggregation Key Prefix: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-10-44_106_4588433479891768182/-ext-10000/
+ Stats Aggregation Key Prefix: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-03-41_604_6168847035474103606/-ext-10000/
PREHOOK: query: insert overwrite table bucket4_1
@@ -178,7 +178,7 @@ POSTHOOK: type: QUERY
POSTHOOK: Lineage: bucket4_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket4_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
ABSTRACT SYNTAX TREE:
- (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket4_1) (TOK_TABLESAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME bucket4_1) (TOK_TABLEBUCKETSAMPLE 1 2) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
STAGE DEPENDENCIES:
Stage-1 is a root stage
@@ -221,11 +221,11 @@ STAGE PLANS:
PREHOOK: query: select * from bucket4_1 tablesample (bucket 1 out of 2) s
PREHOOK: type: QUERY
PREHOOK: Input: default@bucket4_1
-PREHOOK: Output: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-11-09_440_4231562110595025861/-mr-10000
+PREHOOK: Output: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-04-06_999_3814228385258280857/-mr-10000
POSTHOOK: query: select * from bucket4_1 tablesample (bucket 1 out of 2) s
POSTHOOK: type: QUERY
POSTHOOK: Input: default@bucket4_1
-POSTHOOK: Output: hdfs://localhost.localdomain:38821/data/users/sdong/www/open-source-hive3/build/ql/scratchdir/hive_2011-02-16_23-11-09_440_4231562110595025861/-mr-10000
+POSTHOOK: Output: hdfs://localhost.localdomain:54445/data/users/sdong/www/open-source-hive1/build/ql/scratchdir/hive_2011-04-29_00-04-06_999_3814228385258280857/-mr-10000
POSTHOOK: Lineage: bucket4_1.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: bucket4_1.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
0 val_0