Posted to commits@hive.apache.org by na...@apache.org on 2012/12/05 12:59:26 UTC
svn commit: r1417374 [2/11] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/common/
common/src/java/org/apache/hadoop/hive/conf/ conf/
ql/src/java/org/apache/hadoop/hive/ql/
ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/ha...
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java Wed Dec 5 11:59:15 2012
@@ -35,7 +35,6 @@ import org.antlr.runtime.tree.Tree;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.Context;
@@ -56,8 +55,9 @@ import org.apache.hadoop.hive.ql.metadat
import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPrunerUtils;
+import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
-import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
@@ -915,6 +915,28 @@ public abstract class BaseSemanticAnalyz
}
/**
+ * Construct list bucketing context.
+ *
+ * @param skewedColNames skewed column names
+ * @param skewedValues skewed column values
+ * @param skewedColValueLocationMaps mapping from skewed values to locations
+ * @param isStoredAsSubDirectories whether the data is stored as sub-directories
+ * @param conf Hive configuration
+ * @return the constructed list bucketing context
+ */
+ protected ListBucketingCtx constructListBucketingCtx(List<String> skewedColNames,
+ List<List<String>> skewedValues, Map<List<String>, String> skewedColValueLocationMaps,
+ boolean isStoredAsSubDirectories, HiveConf conf) {
+ ListBucketingCtx lbCtx = new ListBucketingCtx();
+ lbCtx.setSkewedColNames(skewedColNames);
+ lbCtx.setSkewedColValues(skewedValues);
+ lbCtx.setLbLocationMap(skewedColValueLocationMaps);
+ lbCtx.setStoredAsSubDirectories(isStoredAsSubDirectories);
+ lbCtx.setDefaultKey(ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_KEY);
+ lbCtx.setDefaultDirName(ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME);
+ return lbCtx;
+ }
+
+ /**
* Given an ASTNode, return a list of values.
*
* use case:
@@ -1036,4 +1058,5 @@ public abstract class BaseSemanticAnalyz
}
return storedAsDirs;
}
+
}
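For context, a minimal sketch of a call site for the new helper, assuming a Table object
tblObj and a HiveConf conf in scope (as in the DDLSemanticAnalyzer hunks below); the
variable names here are illustrative only:

    // Build the list bucketing context from the table's skewed-column metadata.
    ListBucketingCtx lbCtx = constructListBucketingCtx(
        tblObj.getSkewedColNames(),             // e.g. ["key"]
        tblObj.getSkewedColValues(),            // e.g. [["484"]]
        tblObj.getSkewedColValueLocationMaps(), // skewed value list -> location
        tblObj.isStoredAsSubDirectories(),
        conf);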
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/DDLSemanticAnalyzer.java Wed Dec 5 11:59:15 2012
@@ -18,13 +18,8 @@
package org.apache.hadoop.hive.ql.parse;
-import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_CASCADE;
-import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_DATABASECOMMENT;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_DATABASELOCATION;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_DATABASEPROPERTIES;
-import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFEXISTS;
-import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_IFNOTEXISTS;
-import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_SHOWDATABASES;
import java.io.Serializable;
import java.net.URI;
@@ -94,6 +89,7 @@ import org.apache.hadoop.hive.ql.plan.Dr
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.GrantDesc;
import org.apache.hadoop.hive.ql.plan.GrantRevokeRoleDDL;
+import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.LockTableDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
@@ -1217,6 +1213,7 @@ public class DDLSemanticAnalyzer extends
Path oldTblPartLoc = null;
Path newTblPartLoc = null;
Table tblObj = null;
+ ListBucketingCtx lbCtx = null;
try {
tblObj = db.getTable(tableName);
@@ -1258,6 +1255,9 @@ public class DDLSemanticAnalyzer extends
.getAuthority(), partPath.toUri().getPath());
oldTblPartLoc = partPath;
+
+ lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(),
+ part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
}
} else {
inputFormatClass = tblObj.getInputFormatClass();
@@ -1266,6 +1266,9 @@ public class DDLSemanticAnalyzer extends
// input and output are the same
oldTblPartLoc = tblObj.getPath();
newTblPartLoc = tblObj.getPath();
+
+ lbCtx = constructListBucketingCtx(tblObj.getSkewedColNames(), tblObj.getSkewedColValues(),
+ tblObj.getSkewedColValueLocationMaps(), tblObj.isStoredAsSubDirectories(), conf);
}
// throw a HiveException for non-rcfile.
@@ -1290,6 +1293,8 @@ public class DDLSemanticAnalyzer extends
mergeDesc.setInputDir(inputDir);
+ mergeDesc.setLbCtx(lbCtx);
+
addInputsOutputsAlterTable(tableName, partSpec);
DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), mergeDesc);
ddlWork.setNeedLock(true);
@@ -1299,6 +1304,7 @@ public class DDLSemanticAnalyzer extends
mergeDesc.setOutputDir(queryTmpdir);
LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, queryTmpdir, tblDesc,
partSpec == null ? new HashMap<String, String>() : partSpec);
+ ltd.setLbCtx(lbCtx);
Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false),
conf);
mergeTask.addDependentTask(moveTsk);
@@ -2711,9 +2717,6 @@ public class DDLSemanticAnalyzer extends
* hive.internal.ddl.list.bucketing.enable set to false.
*/
HiveConf hiveConf = SessionState.get().getConf();
- if (!(hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_INTERNAL_DDL_LIST_BUCKETING_ENABLE))) {
- throw new SemanticException(ErrorMsg.HIVE_INTERNAL_DDL_LIST_BUCKETING_DISABLED.getMsg());
- }
String tableName = getUnescapedName((ASTNode) ast.getChild(0));
Table tab = null;
@@ -2863,9 +2866,6 @@ public class DDLSemanticAnalyzer extends
* hive.internal.ddl.list.bucketing.enable set to false.
*/
HiveConf hiveConf = SessionState.get().getConf();
- if (!(hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_INTERNAL_DDL_LIST_BUCKETING_ENABLE))) {
- throw new SemanticException(ErrorMsg.HIVE_INTERNAL_DDL_LIST_BUCKETING_DISABLED.getMsg());
- }
/**
* Retrieve mappings from parser
*/
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java Wed Dec 5 11:59:15 2012
@@ -141,6 +141,7 @@ import org.apache.hadoop.hive.ql.plan.Jo
import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc;
import org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc;
import org.apache.hadoop.hive.ql.plan.LimitDesc;
+import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
@@ -4492,6 +4493,7 @@ public class SemanticAnalyzer extends Ba
DynamicPartitionCtx dpCtx = null;
LoadTableDesc ltd = null;
boolean holdDDLTime = checkHoldDDLTime(qb);
+ ListBucketingCtx lbCtx = null;
switch (dest_type.intValue()) {
case QBMetaData.DEST_TABLE: {
@@ -4579,6 +4581,10 @@ public class SemanticAnalyzer extends Ba
currentTableId = destTableId;
destTableId++;
+ lbCtx = constructListBucketingCtx(dest_tab.getSkewedColNames(),
+ dest_tab.getSkewedColValues(), dest_tab.getSkewedColValueLocationMaps(),
+ dest_tab.isStoredAsSubDirectories(), conf);
+
// Create the work for moving the table
// NOTE: specify Dynamic partitions in dest_tab for WriteEntity
if (!isNonNativeTable) {
@@ -4586,6 +4592,7 @@ public class SemanticAnalyzer extends Ba
table_desc, dpCtx);
ltd.setReplace(!qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(),
dest_tab.getTableName()));
+ ltd.setLbCtx(lbCtx);
if (holdDDLTime) {
LOG.info("this query will not update transient_lastDdlTime!");
@@ -4655,10 +4662,14 @@ public class SemanticAnalyzer extends Ba
currentTableId = destTableId;
destTableId++;
+ lbCtx = constructListBucketingCtx(dest_part.getSkewedColNames(),
+ dest_part.getSkewedColValues(), dest_part.getSkewedColValueLocationMaps(),
+ dest_part.isStoredAsSubDirectories(), conf);
ltd = new LoadTableDesc(queryTmpdir, ctx.getExternalTmpFileURI(dest_path.toUri()),
table_desc, dest_part.getSpec());
ltd.setReplace(!qb.getParseInfo().isInsertIntoTable(dest_tab.getDbName(),
dest_tab.getTableName()));
+ ltd.setLbCtx(lbCtx);
if (holdDDLTime) {
try {
@@ -4832,6 +4843,13 @@ public class SemanticAnalyzer extends Ba
rsCtx.getPartnCols(),
dpCtx);
+ /* Set List Bucketing context. */
+ if (lbCtx != null) {
+ lbCtx.processRowSkewedIndex(fsRS);
+ lbCtx.calculateSkewedValueSubDirList();
+ }
+ fileSinkDesc.setLbCtx(lbCtx);
+
// set the stats publishing/aggregating key prefix
// the same as directory name. The directory name
// can be changed in the optimizer but the key should not be changed
@@ -4865,7 +4883,6 @@ public class SemanticAnalyzer extends Ba
return output;
}
-
/**
* Generate the conversion SelectOperator that converts the columns into the
* types that are expected by the table_desc.
@@ -8705,9 +8722,6 @@ public class SemanticAnalyzer extends Ba
* hive.internal.ddl.list.bucketing.enable set to false.
*/
HiveConf hiveConf = SessionState.get().getConf();
- if (!(hiveConf.getBoolVar(HiveConf.ConfVars.HIVE_INTERNAL_DDL_LIST_BUCKETING_ENABLE))) {
- throw new SemanticException(ErrorMsg.HIVE_INTERNAL_DDL_LIST_BUCKETING_DISABLED.getMsg());
- }
// skewed column names
skewedColNames = analyzeSkewedTablDDLColNames(skewedColNames, child);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java Wed Dec 5 11:59:15 2012
@@ -53,6 +53,7 @@ public class ConditionalResolverMergeFil
List<Task<? extends Serializable>> listTasks;
private String dir;
private DynamicPartitionCtx dpCtx; // merge task could be after dynamic partition insert
+ private ListBucketingCtx lbCtx;
public ConditionalResolverMergeFilesCtx() {
}
@@ -103,6 +104,20 @@ public class ConditionalResolverMergeFil
public void setDPCtx(DynamicPartitionCtx dp) {
dpCtx = dp;
}
+
+ /**
+ * @return the lbCtx
+ */
+ public ListBucketingCtx getLbCtx() {
+ return lbCtx;
+ }
+
+ /**
+ * @param lbCtx the lbCtx to set
+ */
+ public void setLbCtx(ListBucketingCtx lbCtx) {
+ this.lbCtx = lbCtx;
+ }
}
public List<Task<? extends Serializable>> getTasks(HiveConf conf,
@@ -131,104 +146,39 @@ public class ConditionalResolverMergeFil
// For each dynamic partition, check if it needs to be merged.
MapredWork work = (MapredWork) mrTask.getWork();
+ int lbLevel = (ctx.getLbCtx() == null) ? 0 : ctx.getLbCtx().calculateListBucketingLevel();
+
+ /**
+ * To keep the code easy to read, it is written as follows:
+ * 1. the first if clause distinguishes dynamic partitions from static partitions.
+ * 2. for static partitions, we distinguish list bucketing from non-list bucketing.
+ * Another way to write it would be to merge static partition w/ LB with DP; in that
+ * case we would still need to differentiate them, since one uses lbLevel and the
+ * other lbLevel + numDPCols.
+ * The first approach is chosen mainly for readability.
+ */
// Dynamic partition: replace input path (root to dp paths) with dynamic partition
// input paths.
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
+ int numDPCols = dpCtx.getNumDPCols();
+ int dpLbLevel = numDPCols + lbLevel;
- // get list of dynamic partitions
- FileStatus[] status = Utilities.getFileStatusRecurse(dirPath,
- dpCtx.getNumDPCols(), inpFs);
-
- // cleanup pathToPartitionInfo
- Map<String, PartitionDesc> ptpi = work.getPathToPartitionInfo();
- assert ptpi.size() == 1;
- String path = ptpi.keySet().iterator().next();
- TableDesc tblDesc = ptpi.get(path).getTableDesc();
- ptpi.remove(path); // the root path is not useful anymore
-
- // cleanup pathToAliases
- Map<String, ArrayList<String>> pta = work.getPathToAliases();
- assert pta.size() == 1;
- path = pta.keySet().iterator().next();
- ArrayList<String> aliases = pta.get(path);
- pta.remove(path); // the root path is not useful anymore
-
- // populate pathToPartitionInfo and pathToAliases w/ DP paths
- long totalSz = 0;
- boolean doMerge = false;
- // list of paths that don't need to merge but need to move to the dest location
- List<String> toMove = new ArrayList<String>();
- for (int i = 0; i < status.length; ++i) {
- long len = getMergeSize(inpFs, status[i].getPath(), avgConditionSize);
- if (len >= 0) {
- doMerge = true;
- totalSz += len;
- Map<String, String> fullPartSpec = new LinkedHashMap<String, String>(
- dpCtx.getPartSpec());
- Warehouse.makeSpecFromName(fullPartSpec, status[i].getPath());
- PartitionDesc pDesc = new PartitionDesc(tblDesc, (LinkedHashMap) fullPartSpec);
-
- work.resolveDynamicPartitionMerge(conf, status[i].getPath(), tblDesc,
- aliases, pDesc);
- } else {
- toMove.add(status[i].getPath().toString());
- }
- }
- if (doMerge) {
- // add the merge MR job
- setupMapRedWork(conf, work, trgtSize, totalSz);
-
- // add the move task for those partitions that do not need merging
- if (toMove.size() > 0) {
- // modify the existing move task as it is already in the candidate running tasks
-
- // running the MoveTask and MR task in parallel may
- // cause the mvTask write to /ds=1 and MR task write
- // to /ds=1_1 for the same partition.
- // make the MoveTask as the child of the MR Task
- resTsks.add(mrAndMvTask);
-
- MoveWork mvWork = (MoveWork) mvTask.getWork();
- LoadFileDesc lfd = mvWork.getLoadFileWork();
-
- String targetDir = lfd.getTargetDir();
- List<String> targetDirs = new ArrayList<String>(toMove.size());
- int numDPCols = dpCtx.getNumDPCols();
-
- for (int i = 0; i < toMove.size(); i++) {
- String toMoveStr = toMove.get(i);
- if (toMoveStr.endsWith(Path.SEPARATOR)) {
- toMoveStr = toMoveStr.substring(0, toMoveStr.length() - 1);
- }
- String [] moveStrSplits = toMoveStr.split(Path.SEPARATOR);
- int dpIndex = moveStrSplits.length - numDPCols;
- String target = targetDir;
- while (dpIndex < moveStrSplits.length) {
- target = target + Path.SEPARATOR + moveStrSplits[dpIndex];
- dpIndex ++;
- }
-
- targetDirs.add(target);
- }
-
- LoadMultiFilesDesc lmfd = new LoadMultiFilesDesc(toMove,
- targetDirs, lfd.getIsDfsDir(), lfd.getColumns(), lfd.getColumnTypes());
- mvWork.setLoadFileWork(null);
- mvWork.setLoadTableWork(null);
- mvWork.setMultiFilesDesc(lmfd);
- } else {
- resTsks.add(mrTask);
- }
- } else { // add the move task
- resTsks.add(mvTask);
- }
+ generateActualTasks(conf, resTsks, trgtSize, avgConditionSize, mvTask, mrTask,
+ mrAndMvTask, dirPath, inpFs, ctx, work, dpLbLevel);
} else { // no dynamic partitions
- long totalSz = getMergeSize(inpFs, dirPath, avgConditionSize);
- if (totalSz >= 0) { // add the merge job
- setupMapRedWork(conf, work, trgtSize, totalSz);
- resTsks.add(mrTask);
- } else { // don't need to merge, add the move job
- resTsks.add(mvTask);
+ if (lbLevel == 0) {
+ // static partition without list bucketing
+ long totalSz = getMergeSize(inpFs, dirPath, avgConditionSize);
+ if (totalSz >= 0) { // add the merge job
+ setupMapRedWork(conf, work, trgtSize, totalSz);
+ resTsks.add(mrTask);
+ } else { // don't need to merge, add the move job
+ resTsks.add(mvTask);
+ }
+ } else {
+ // static partition and list bucketing
+ generateActualTasks(conf, resTsks, trgtSize, avgConditionSize, mvTask, mrTask,
+ mrAndMvTask, dirPath, inpFs, ctx, work, lbLevel);
}
}
} else {
@@ -244,6 +194,131 @@ public class ConditionalResolverMergeFil
return resTsks;
}
+ /**
+ * This method generates the actual tasks for a conditional task. It can be
+ * 1. a move task only,
+ * 2. a merge task only, or
+ * 3. a merge task followed by a move task.
+ * Previously this was true for dynamic partitions only, since a static partition
+ * didn't have #3. That changes with list bucketing: a static partition defined as
+ * skewed and stored as directories keeps a sub-directory per skewed value plus a
+ * default directory instead of all files in one directory, so #3 is required for
+ * static partitions too.
+ * The logic is therefore factored into this method so both SP and DP can use it.
+ * @param conf
+ * @param resTsks
+ * @param trgtSize
+ * @param avgConditionSize
+ * @param mvTask
+ * @param mrTask
+ * @param mrAndMvTask
+ * @param dirPath
+ * @param inpFs
+ * @param ctx
+ * @param work
+ * @param dpLbLevel
+ * @throws IOException
+ */
+ private void generateActualTasks(HiveConf conf, List<Task<? extends Serializable>> resTsks,
+ long trgtSize, long avgConditionSize, Task<? extends Serializable> mvTask,
+ Task<? extends Serializable> mrTask, Task<? extends Serializable> mrAndMvTask, Path dirPath,
+ FileSystem inpFs, ConditionalResolverMergeFilesCtx ctx, MapredWork work, int dpLbLevel)
+ throws IOException {
+ DynamicPartitionCtx dpCtx = ctx.getDPCtx();
+ // get list of dynamic partitions
+ FileStatus[] status = Utilities.getFileStatusRecurse(dirPath, dpLbLevel, inpFs);
+
+ // cleanup pathToPartitionInfo
+ Map<String, PartitionDesc> ptpi = work.getPathToPartitionInfo();
+ assert ptpi.size() == 1;
+ String path = ptpi.keySet().iterator().next();
+ PartitionDesc partDesc = ptpi.get(path);
+ TableDesc tblDesc = partDesc.getTableDesc();
+ ptpi.remove(path); // the root path is not useful anymore
+
+ // cleanup pathToAliases
+ Map<String, ArrayList<String>> pta = work.getPathToAliases();
+ assert pta.size() == 1;
+ path = pta.keySet().iterator().next();
+ ArrayList<String> aliases = pta.get(path);
+ pta.remove(path); // the root path is not useful anymore
+
+ // populate pathToPartitionInfo and pathToAliases w/ DP paths
+ long totalSz = 0;
+ boolean doMerge = false;
+ // list of paths that don't need to merge but need to move to the dest location
+ List<String> toMove = new ArrayList<String>();
+ for (int i = 0; i < status.length; ++i) {
+ long len = getMergeSize(inpFs, status[i].getPath(), avgConditionSize);
+ if (len >= 0) {
+ doMerge = true;
+ totalSz += len;
+ PartitionDesc pDesc = (dpCtx != null) ? generateDPFullPartSpec(dpCtx, status, tblDesc, i)
+ : partDesc;
+ work.resolveDynamicPartitionStoredAsSubDirsMerge(conf, status[i].getPath(), tblDesc,
+ aliases, pDesc);
+ } else {
+ toMove.add(status[i].getPath().toString());
+ }
+ }
+ if (doMerge) {
+ // add the merge MR job
+ setupMapRedWork(conf, work, trgtSize, totalSz);
+
+ // add the move task for those partitions that do not need merging
+ if (toMove.size() > 0) {
+ // modify the existing move task as it is already in the candidate running tasks
+
+ // running the MoveTask and MR task in parallel may
+ // cause the mvTask write to /ds=1 and MR task write
+ // to /ds=1_1 for the same partition.
+ // make the MoveTask as the child of the MR Task
+ resTsks.add(mrAndMvTask);
+
+ MoveWork mvWork = (MoveWork) mvTask.getWork();
+ LoadFileDesc lfd = mvWork.getLoadFileWork();
+
+ String targetDir = lfd.getTargetDir();
+ List<String> targetDirs = new ArrayList<String>(toMove.size());
+
+ for (int i = 0; i < toMove.size(); i++) {
+ String toMoveStr = toMove.get(i);
+ if (toMoveStr.endsWith(Path.SEPARATOR)) {
+ toMoveStr = toMoveStr.substring(0, toMoveStr.length() - 1);
+ }
+ String[] moveStrSplits = toMoveStr.split(Path.SEPARATOR);
+ int dpIndex = moveStrSplits.length - dpLbLevel;
+ String target = targetDir;
+ while (dpIndex < moveStrSplits.length) {
+ target = target + Path.SEPARATOR + moveStrSplits[dpIndex];
+ dpIndex++;
+ }
+
+ targetDirs.add(target);
+ }
+
+ LoadMultiFilesDesc lmfd = new LoadMultiFilesDesc(toMove,
+ targetDirs, lfd.getIsDfsDir(), lfd.getColumns(), lfd.getColumnTypes());
+ mvWork.setLoadFileWork(null);
+ mvWork.setLoadTableWork(null);
+ mvWork.setMultiFilesDesc(lmfd);
+ } else {
+ resTsks.add(mrTask);
+ }
+ } else { // add the move task
+ resTsks.add(mvTask);
+ }
+ }
+
+ private PartitionDesc generateDPFullPartSpec(DynamicPartitionCtx dpCtx, FileStatus[] status,
+ TableDesc tblDesc, int i) {
+ Map<String, String> fullPartSpec = new LinkedHashMap<String, String>(
+ dpCtx.getPartSpec());
+ Warehouse.makeSpecFromName(fullPartSpec, status[i].getPath());
+ PartitionDesc pDesc = new PartitionDesc(tblDesc, (LinkedHashMap) fullPartSpec);
+ return pDesc;
+ }
+
private void setupMapRedWork(HiveConf conf, MapredWork work, long targetSize, long totalSize) {
if (work.getNumReduceTasks() > 0) {
int maxReducers = conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java Wed Dec 5 11:59:15 2012
@@ -60,6 +60,7 @@ public class FileSinkDesc extends Abstra
transient private List<FileSinkDesc> linkedFileSinkDesc;
private boolean statsReliable;
+ private ListBucketingCtx lbCtx;
private int maxStatsKeyPrefixLength = -1;
public FileSinkDesc() {
@@ -309,6 +310,20 @@ public class FileSinkDesc extends Abstra
this.statsReliable = statsReliable;
}
+ /**
+ * @return the lbCtx
+ */
+ public ListBucketingCtx getLbCtx() {
+ return lbCtx;
+ }
+
+ /**
+ * @param lbCtx the lbCtx to set
+ */
+ public void setLbCtx(ListBucketingCtx lbCtx) {
+ this.lbCtx = lbCtx;
+ }
+
public List<FileSinkDesc> getLinkedFileSinkDesc() {
return linkedFileSinkDesc;
}
@@ -324,4 +339,5 @@ public class FileSinkDesc extends Abstra
public void setMaxStatsKeyPrefixLength(int maxStatsKeyPrefixLength) {
this.maxStatsKeyPrefixLength = maxStatsKeyPrefixLength;
}
+
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ListBucketingCtx.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ListBucketingCtx.java?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ListBucketingCtx.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ListBucketingCtx.java Wed Dec 5 11:59:15 2012
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPrunerUtils;
+/**
+ * Context for list bucketing.
+ * It is created in SemanticAnalyzer.genFileSinkPlan() and used in
+ * FileSinkOperator.processOp(), in file merging, in alter table ... concatenate, etc.
+ */
+public class ListBucketingCtx implements Serializable {
+ /**
+ * default serialization ID.
+ */
+ private static final long serialVersionUID = 1L;
+ private List<String> skewedColNames;
+ private List<List<String>> skewedColValues;
+ private Map<List<String>, String> lbLocationMap;
+ private List<Integer> rowSkewedIndex;
+ private boolean isStoredAsSubDirectories;
+ private String defaultKey;
+ private String defaultDirName;
+ private List<String> skewedValuesDirNames;
+
+ public ListBucketingCtx() {
+ rowSkewedIndex = new ArrayList<Integer>();
+ skewedValuesDirNames = new ArrayList<String>();
+ }
+
+ /**
+ * @return the skewedColNames
+ */
+ public List<String> getSkewedColNames() {
+ return skewedColNames;
+ }
+
+ /**
+ * @param skewedColNames the skewedColNames to set
+ */
+ public void setSkewedColNames(List<String> skewedColNames) {
+ this.skewedColNames = skewedColNames;
+ }
+
+ /**
+ * @return the skewedColValues
+ */
+ public List<List<String>> getSkewedColValues() {
+ return skewedColValues;
+ }
+
+ /**
+ * @param skewedColValues the skewedColValues to set
+ */
+ public void setSkewedColValues(List<List<String>> skewedColValues) {
+ this.skewedColValues = skewedColValues;
+ }
+
+ /**
+ * @return the lbLocationMap
+ */
+ public Map<List<String>, String> getLbLocationMap() {
+ return lbLocationMap;
+ }
+
+ /**
+ * @param lbLocationMap the lbLocationMap to set
+ */
+ public void setLbLocationMap(Map<List<String>, String> lbLocationMap) {
+ this.lbLocationMap = lbLocationMap;
+ }
+
+ /**
+ * Match each column against the skewed column list and record its position.
+ * The positions are used in {@link FileSinkOperator} generateListBucketingDirName().
+ * Note that skewed column names match skewed values in order.
+ *
+ * @param rowSch the row schema
+ */
+ public void processRowSkewedIndex(RowSchema rowSch) {
+ if ((this.skewedColNames != null) && (this.skewedColNames.size() > 0) && (rowSch != null)
+ && (rowSch.getSignature() != null) && (rowSch.getSignature().size() > 0)) {
+ List<ColumnInfo> cols = rowSch.getSignature();
+ int hitNo = 0;
+ for (int i = 0; i < cols.size(); i++) {
+ int index = this.skewedColNames.indexOf(cols.get(i).getInternalName());
+ if (index > -1) {
+ hitNo++;
+ rowSkewedIndex.add(index);
+ }
+ }
+ assert (hitNo == this.skewedColNames.size()) : "RowSchema doesn't have all skewed columns. "
+ + "Skewed columns: " + this.skewedColNames.toString() + ". RowSchema has columns: " + cols;
+ }
+ }
+
+ /**
+ * Calculate the skewed value subdirectory list, which is used in
+ * FileSinkOperator.java createKeyForStatsPublisher().
+ * For example, given: create table test skewed by (key, value) on (('484','val_484'))
+ * stored as DIRECTORIES;
+ * after this method, skewedValuesDirNames will contain 2 elements:
+ * key=484/value=val_484
+ * HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME/HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME
+ */
+ public void calculateSkewedValueSubDirList() {
+ if (isSkewedStoredAsDir()) {
+ for (List<String> value : this.skewedColValues) {
+ skewedValuesDirNames.add(FileUtils.makeListBucketingDirName(this.skewedColNames, value));
+ }
+ // create the default dir
+ skewedValuesDirNames.add(FileUtils.makeDefaultListBucketingDirName(
+ this.skewedColNames,
+ ListBucketingPrunerUtils.HIVE_LIST_BUCKETING_DEFAULT_DIR_NAME));
+ }
+ }
+
+ /**
+ * @return the rowSkewedIndex
+ */
+ public List<Integer> getRowSkewedIndex() {
+ return rowSkewedIndex;
+ }
+
+ /**
+ * @param rowSkewedIndex the rowSkewedIndex to set
+ */
+ public void setRowSkewedIndex(List<Integer> rowSkewedIndex) {
+ this.rowSkewedIndex = rowSkewedIndex;
+ }
+
+ /**
+ * @return the isStoredAsSubDirectories
+ */
+ public boolean isStoredAsSubDirectories() {
+ return isStoredAsSubDirectories;
+ }
+
+ /**
+ * @param isStoredAsSubDirectories the isStoredAsSubDirectories to set
+ */
+ public void setStoredAsSubDirectories(boolean isStoredAsSubDirectories) {
+ this.isStoredAsSubDirectories = isStoredAsSubDirectories;
+ }
+
+ /**
+ * @return the defaultKey
+ */
+ public String getDefaultKey() {
+ return defaultKey;
+ }
+
+ /**
+ * @param defaultKey the defaultKey to set
+ */
+ public void setDefaultKey(String defaultKey) {
+ this.defaultKey = defaultKey;
+ }
+
+ /**
+ * @return the defaultDirName
+ */
+ public String getDefaultDirName() {
+ return defaultDirName;
+ }
+
+ /**
+ * @param defaultDirName the defaultDirName to set
+ */
+ public void setDefaultDirName(String defaultDirName) {
+ this.defaultDirName = defaultDirName;
+ }
+
+ /**
+ * Check whether the table/partition is skewed and stored as sub-directories.
+ *
+ * @return true if skewed column names and values are both present and the data
+ * is stored as sub-directories
+ */
+ public boolean isSkewedStoredAsDir() {
+ return (this.getSkewedColNames() != null)
+ && (this.getSkewedColNames().size() > 0)
+ && (this.getSkewedColValues() != null)
+ && (this.getSkewedColValues().size() > 0)
+ && (this.isStoredAsSubDirectories());
+ }
+
+ /**
+ * Calculate list bucketing level.
+ *
+ * 0: not list bucketing
+ * n: number of skewed columns
+ *
+ * @return the list bucketing level
+ */
+ public int calculateListBucketingLevel() {
+ int lbLevel = isSkewedStoredAsDir() ? this.getSkewedColNames().size() : 0;
+ return lbLevel;
+ }
+
+ /**
+ * @return the skewedValuesDirNames
+ */
+ public List<String> getSkewedValuesDirNames() {
+ return skewedValuesDirNames;
+ }
+
+ /**
+ * @param skewedValuesDirNames the skewedValuesDirNames to set
+ */
+ public void setSkewedValuesDirNames(List<String> skewedValuesDirNames) {
+ this.skewedValuesDirNames = skewedValuesDirNames;
+ }
+}
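As a reading aid, a minimal sketch (not the verbatim commit code) of how a
ListBucketingCtx flows through SemanticAnalyzer.genFileSinkPlan() per the hunks above,
assuming dest_tab, ltd, fsRS and fileSinkDesc are in scope:

    ListBucketingCtx lbCtx = constructListBucketingCtx(
        dest_tab.getSkewedColNames(), dest_tab.getSkewedColValues(),
        dest_tab.getSkewedColValueLocationMaps(),
        dest_tab.isStoredAsSubDirectories(), conf);
    ltd.setLbCtx(lbCtx);                      // carried by the LoadTableDesc
    if (lbCtx != null) {
      lbCtx.processRowSkewedIndex(fsRS);      // record skewed column positions
      lbCtx.calculateSkewedValueSubDirList(); // e.g. key=484/value=val_484 + default dir
    }
    fileSinkDesc.setLbCtx(lbCtx);             // consumed by FileSinkOperator at runtime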
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/LoadTableDesc.java Wed Dec 5 11:59:15 2012
@@ -32,8 +32,9 @@ public class LoadTableDesc extends org.a
private boolean replace;
private String tmpDir;
private DynamicPartitionCtx dpCtx;
+ private ListBucketingCtx lbCtx;
private boolean holdDDLTime;
- private boolean inheritTableSpecs = true; //For partitions, flag controlling whether the current
+ private boolean inheritTableSpecs = true; //For partitions, flag controlling whether the current
//table specs are to be used
// TODO: the below seems like they should just be combined into partitionDesc
@@ -138,4 +139,18 @@ public class LoadTableDesc extends org.a
public void setInheritTableSpecs(boolean inheritTableSpecs) {
this.inheritTableSpecs = inheritTableSpecs;
}
+
+ /**
+ * @return the lbCtx
+ */
+ public ListBucketingCtx getLbCtx() {
+ return lbCtx;
+ }
+
+ /**
+ * @param lbCtx the lbCtx to set
+ */
+ public void setLbCtx(ListBucketingCtx lbCtx) {
+ this.lbCtx = lbCtx;
+ }
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java Wed Dec 5 11:59:15 2012
@@ -34,7 +34,6 @@ import org.apache.hadoop.hive.ql.exec.Ut
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SplitSample;
-import org.apache.hadoop.hive.ql.session.SessionState;
/**
* MapredWork.
@@ -161,31 +160,13 @@ public class MapredWork extends Abstract
while (itr.hasNext()) {
final Entry<String, ArrayList<String>> entry = itr.next();
String origiKey = entry.getKey();
- String newKey = removePrefixFromWarehouseConfig(origiKey);
+ String newKey = PlanUtils.removePrefixFromWarehouseConfig(origiKey);
ArrayList<String> value = entry.getValue();
trunPathToAliases.put(newKey, value);
}
return trunPathToAliases;
}
- /**
- * Remove prefix from "Path -> Alias"
- *
- * @param origiKey
- * @return
- */
- private String removePrefixFromWarehouseConfig(String origiKey) {
- String prefix = SessionState.get().getConf().getVar(HiveConf.ConfVars.METASTOREWAREHOUSE);
- if ((prefix != null) && (prefix.length() > 0)) {
- //Local file system is using pfile:/// {@link ProxyLocalFileSystem}
- prefix = prefix.replace("pfile:///", "pfile:/");
- int index = origiKey.indexOf(prefix);
- if (index > -1) {
- origiKey = origiKey.substring(index + prefix.length());
- }
- }
- return origiKey;
- }
@Explain(displayName = "Path -> Partition", normalExplain = false)
@@ -499,7 +480,7 @@ public class MapredWork extends Abstract
this.inputFormatSorted = inputFormatSorted;
}
- public void resolveDynamicPartitionMerge(HiveConf conf, Path path,
+ public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, Path path,
TableDesc tblDesc, ArrayList<String> aliases, PartitionDesc partDesc) {
pathToAliases.put(path.toString(), aliases);
pathToPartitionInfo.put(path.toString(), partDesc);
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java Wed Dec 5 11:59:15 2012
@@ -45,6 +45,7 @@ import org.apache.hadoop.hive.ql.metadat
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory;
+import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.DelimitedJSONSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;
@@ -731,6 +732,28 @@ public final class PlanUtils {
return val;
}
+ /**
+ * Remove the warehouse prefix from a "Path -> Alias" key.
+ * This is required for testing: to verify that a path is right, we need to display
+ * it in the expected test result, but the test mask patterns would mask it.
+ * So we remove the prefix from the path, which is what triggers the mask pattern.
+ * @param origiKey the original key
+ * @return the key with the warehouse prefix removed
+ */
+ public static String removePrefixFromWarehouseConfig(String origiKey) {
+ String prefix = SessionState.get().getConf().getVar(HiveConf.ConfVars.METASTOREWAREHOUSE);
+ if ((prefix != null) && (prefix.length() > 0)) {
+ // The local file system uses pfile:/// ({@link ProxyLocalFileSystem})
+ prefix = prefix.replace("pfile:///", "pfile:/");
+ int index = origiKey.indexOf(prefix);
+ if (index > -1) {
+ origiKey = origiKey.substring(index + prefix.length());
+ }
+ }
+ return origiKey;
+ }
+
private PlanUtils() {
// prevent instantiation
}
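To illustrate the relocated helper, a hedged example with a made-up warehouse location
(calling it requires an active SessionState; the concrete paths are hypothetical):

    // Assume hive.metastore.warehouse.dir = pfile:///user/hive/warehouse.
    // The method normalizes "pfile:///" to "pfile:/" and strips the prefix, so:
    //   in:  pfile:/user/hive/warehouse/list_bucketing_static_part/ds=2008-04-08
    //   out: /list_bucketing_static_part/ds=2008-04-08
    String key = "pfile:/user/hive/warehouse/list_bucketing_static_part/ds=2008-04-08";
    String shortKey = PlanUtils.removePrefixFromWarehouseConfig(key);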
Modified: hive/trunk/ql/src/test/queries/clientnegative/column_change_skewedcol_type1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/column_change_skewedcol_type1.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/column_change_skewedcol_type1.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/column_change_skewedcol_type1.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE skewedtable (key STRING, value STRING) SKEWED BY (key) ON (1,5,6);
Modified: hive/trunk/ql/src/test/queries/clientnegative/column_rename5.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/column_rename5.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/column_rename5.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/column_rename5.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE skewedtable (key STRING, value STRING) SKEWED BY (key) ON (1,5,6);
Modified: hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_col_name_value_no_mismatch.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_col_name_value_no_mismatch.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_col_name_value_no_mismatch.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_col_name_value_no_mismatch.q Wed Dec 5 11:59:15 2012
@@ -1,4 +1,3 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE skewed_table (key STRING, value STRING) SKEWED BY (key) ON ((1),(5,8),(6));
Modified: hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_dup_col_name.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_dup_col_name.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_dup_col_name.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_dup_col_name.q Wed Dec 5 11:59:15 2012
@@ -1,4 +1,3 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE skewed_table (key STRING, value STRING) SKEWED BY (key,key) ON ((1),(5),(6));
Modified: hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_failure_invalid_col_name.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_failure_invalid_col_name.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_failure_invalid_col_name.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/create_skewed_table_failure_invalid_col_name.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE skewed_table (key STRING, value STRING) SKEWED BY (key_non) ON ((1),(5),(6));
\ No newline at end of file
Modified: hive/trunk/ql/src/test/queries/clientnegative/invalid_config1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/invalid_config1.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/invalid_config1.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/invalid_config1.q Wed Dec 5 11:59:15 2012
@@ -1,4 +1,3 @@
-
-set hive.internal.ddl.list.bucketing.enable=true;
+set mapred.input.dir.recursive=true;
CREATE TABLE skewedtable (key STRING, value STRING) SKEWED BY (key) ON (1,5,6);
Modified: hive/trunk/ql/src/test/queries/clientnegative/load_stored_as_dirs.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/load_stored_as_dirs.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/load_stored_as_dirs.q (original)
+++ hive/trunk/ql/src/test/queries/clientnegative/load_stored_as_dirs.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
-- Load data can't work with table with stored as directories
CREATE TABLE if not exists stored_as_dirs_multiple (col1 STRING, col2 int, col3 STRING)
Modified: hive/trunk/ql/src/test/queries/clientpositive/alter_skewed_table.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/alter_skewed_table.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/alter_skewed_table.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/alter_skewed_table.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
create table original (key STRING, value STRING);
Modified: hive/trunk/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/create_alter_list_bucketing_table1.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
-- Test stored as directories
-- it covers a few cases
Modified: hive/trunk/ql/src/test/queries/clientpositive/create_skewed_table1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/create_skewed_table1.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/create_skewed_table1.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/create_skewed_table1.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
CREATE TABLE list_bucket_single (key STRING, value STRING) SKEWED BY (key) ON ('1','5','6');
CREATE TABLE list_bucket_single_2 (key STRING, value STRING) SKEWED BY (key) ON ((1),(5),(6));
CREATE TABLE list_bucket_multiple (col1 STRING, col2 int, col3 STRING) SKEWED BY (col1, col2) ON (('s1',1), ('s3',3), ('s13',13), ('s78',78));
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_1.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_1.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_1.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,40 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set mapred.input.dir.recursive=true;
+
+-- list bucketing DML : dynamic partition and 2 stage query plan.
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- create a skewed table
+create table list_bucketing_dynamic_part (key String, value String)
+partitioned by (ds String, hr String)
+skewed by (key) on ("484")
+stored as DIRECTORIES
+;
+
+-- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds='2008-04-08', hr) select key, value, hr from srcpart where ds='2008-04-08';
+insert overwrite table list_bucketing_dynamic_part partition (ds='2008-04-08', hr) select key, value, hr from srcpart where ds='2008-04-08';
+
+-- check DML result
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='11');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='12');
+
+select count(1) from srcpart where ds='2008-04-08';
+select count(1) from list_bucketing_dynamic_part where ds='2008-04-08';
+
+select key, value from srcpart where ds='2008-04-08' and hr='11' and key = "484";
+set hive.optimize.listbucketing=true;
+explain extended
+select key, value from list_bucketing_dynamic_part where ds='2008-04-08' and hr='11' and key = "484";
+select key, value from list_bucketing_dynamic_part where ds='2008-04-08' and hr='11' and key = "484";
+
+-- clean up resources
+drop table list_bucketing_dynamic_part;
+
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_2.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_2.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_2.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_2.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,71 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set hive.stats.reliable=true;
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- list bucketing DML: static partition. multiple skewed columns.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_static_part;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.optimize.listbucketing=true;
+explain extended
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- the table contains 51 and val_51, so there should be no skewed data for (51, val_14),
+-- but queries on 51, or on 51 and val_14, should still succeed
+select * from srcpart where ds = '2008-04-08' and key = '51';
+select * from list_bucketing_static_part where key = '51';
+select * from srcpart where ds = '2008-04-08' and key = '51' and value = 'val_14';
+select * from list_bucketing_static_part where key = '51' and value = 'val_14';
+
+-- queries with < <= > >= should work on the skewed table although we don't benefit from pruning
+select count(1) from srcpart where ds = '2008-04-08' and key < '51';
+select count(1) from list_bucketing_static_part where key < '51';
+select count(1) from srcpart where ds = '2008-04-08' and key <= '51';
+select count(1) from list_bucketing_static_part where key <= '51';
+select count(1) from srcpart where ds = '2008-04-08' and key > '51';
+select count(1) from list_bucketing_static_part where key > '51';
+select count(1) from srcpart where ds = '2008-04-08' and key >= '51';
+select count(1) from list_bucketing_static_part where key >= '51';
+
+-- clean up
+drop table list_bucketing_static_part;
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_3.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_3.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_3.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_3.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,33 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set mapred.input.dir.recursive=true;
+
+-- list bucketing DML : static partition and 2 stage query plan.
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String) partitioned by (ds String, hr String) skewed by (key) on ("484") stored as DIRECTORIES;
+
+-- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds='2008-04-08', hr='11') select key, value from srcpart where ds='2008-04-08';
+insert overwrite table list_bucketing_static_part partition (ds='2008-04-08', hr='11') select key, value from srcpart where ds='2008-04-08';
+
+-- check DML result
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+select count(1) from srcpart where ds='2008-04-08';
+select count(1) from list_bucketing_static_part where ds='2008-04-08';
+
+select key, value from srcpart where ds='2008-04-08' and hr='11' and key = "484";
+set hive.optimize.listbucketing=true;
+explain extended
+select key, value from list_bucketing_static_part where ds='2008-04-08' and hr='11' and key = "484";
+select key, value from list_bucketing_static_part where ds='2008-04-08' and hr='11' and key = "484";
+-- clean up resources
+drop table list_bucketing_static_part;
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_4.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_4.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_4.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_4.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,71 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- list bucketing DML: static partition. multiple skewed columns. merge.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103/value=val_103:
+-- 99 000000_0
+-- 99 000001_0
+-- after merge
+-- 142 000000_0
+-- ds=2008-04-08/hr=11/key=484/value=val_484:
+-- 87 000000_0
+-- 87 000001_0
+-- after merge
+-- 118 000001_0
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+set hive.merge.mapfiles=true;
+set hive.merge.mapredfiles=true;
+-- list bucketing DML with merge. use bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_static_part;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.optimize.listbucketing=true;
+explain extended
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- clean up
+drop table list_bucketing_static_part;
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_5.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_5.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_5.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_5.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,38 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set mapred.input.dir.recursive=true;
+
+-- list bucketing DML: multiple skewed columns. 2 stages
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- create a skewed table
+create table list_bucketing_dynamic_part (key String, value String)
+partitioned by (ds String, hr String)
+skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
+stored as DIRECTORIES;
+
+-- list bucketing DML
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds='2008-04-08', hr) select key, value, hr from srcpart where ds='2008-04-08';
+insert overwrite table list_bucketing_dynamic_part partition (ds='2008-04-08', hr) select key, value, hr from srcpart where ds='2008-04-08';
+
+-- check DML result
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='11');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='12');
+
+select count(1) from srcpart where ds='2008-04-08';
+select count(1) from list_bucketing_dynamic_part where ds='2008-04-08';
+
+select key, value from srcpart where ds='2008-04-08' and key = "103" and value ="val_103";
+set hive.optimize.listbucketing=true;
+explain extended
+select key, value from list_bucketing_dynamic_part where ds='2008-04-08' and key = "103" and value ="val_103";
+select key, value from list_bucketing_dynamic_part where ds='2008-04-08' and key = "103" and value ="val_103";
+
+-- clean up resources
+drop table list_bucketing_dynamic_part;
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_6.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_6.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_6.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_6.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,97 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
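+-- (illustrative note, not in the original commit) BucketizedHiveInputFormat does not combine
+-- input files into shared splits, so each file gets its own mapper; with the merge settings
+-- above disabled, every mapper writes its own output file, producing the small files this
+-- test relies on.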
+
+-- list bucketing DML: dynamic partition. multiple skewed columns. merge.
+-- The following explains the merge example used in this test case
+-- DML will generate 2 partitions
+-- ds=2008-04-08/hr=a1
+-- ds=2008-04-08/hr=b1
+-- without merge, each partition has more files
+-- ds=2008-04-08/hr=a1 has 2 files
+-- ds=2008-04-08/hr=b1 has 6 files
+-- with merge, each partition has fewer files
+-- ds=2008-04-08/hr=a1 has 1 file
+-- ds=2008-04-08/hr=b1 has 4 files
+-- The following shows file size and name in each directory
+-- hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- without merge
+-- 155 000000_0
+-- 155 000001_0
+-- with merge
+-- 254 000000_0
+-- hr=b1/key=103/value=val_103:
+-- without merge
+-- 99 000000_0
+-- 99 000001_0
+-- with merge
+-- 142 000001_0
+-- hr=b1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- without merge
+-- 5181 000000_0
+-- 5181 000001_0
+-- with merge
+-- 5181 000000_0
+-- 5181 000001_0
+-- hr=b1/key=484/value=val_484
+-- without merge
+-- 87 000000_0
+-- 87 000001_0
+-- with merge
+-- 118 000002_0
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- create a skewed table
+create table list_bucketing_dynamic_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_dynamic_part;
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
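+
+-- (illustrative sketch, not in the original commit) assuming the table lives under the
+-- default warehouse location, the per-directory layout described at the top of this file
+-- can be inspected with:
+dfs -lsr ${hiveconf:hive.metastore.warehouse.dir}/list_bucketing_dynamic_part/ds=2008-04-08;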
+
+set hive.merge.mapfiles=true;
+set hive.merge.mapredfiles=true;
+-- list bucketing DML with merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_dynamic_part;
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
+
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_dynamic_part;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.optimize.listbucketing=true;
+explain extended
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- clean up
+drop table list_bucketing_dynamic_part;
+
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_7.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_7.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_7.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_7.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,70 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+set hive.merge.rcfile.block.level=true;
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- list bucketing DML: dynamic partition (one level), merge, one skewed column
+-- DML without merge leaves a mix of small and big files:
+-- ds=2008-04-08/hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/
+-- 155 000000_0
+-- ds=2008-04-08/hr=b1/key=484
+-- 87 000000_0
+-- 87 000001_0
+-- ds=2008-04-08/hr=b1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/
+-- 5201 000000_0
+-- 5201 000001_0
+-- DML with merge will merge small files
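+-- (illustrative note, not in the original commit) hive.merge.rcfile.block.level=true, set
+-- above, lets the merge task concatenate existing RCFile blocks instead of deserializing
+-- and rewriting rows, which keeps the merge cheap for RCFILE output.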
+
+-- skewed table
+CREATE TABLE list_bucketing_dynamic_part (key String, value STRING)
+ PARTITIONED BY (ds string, hr string)
+ skewed by (key) on ('484')
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_dynamic_part;
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
+
+set hive.merge.mapfiles=true;
+set hive.merge.mapredfiles=true;
+-- list bucketing DML with merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_dynamic_part;
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
+
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_dynamic_part;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+explain extended
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- clean up
+drop table list_bucketing_dynamic_part;
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_8.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_8.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_8.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_8.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,90 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+
+-- list bucketing alter table ... concatenate:
+-- Use list bucketing DML to generate multiple files in partitions by turning off merge
+-- dynamic partition. multiple skewed columns. merge.
+-- The following explains the merge example used in this test case
+-- DML will generate 2 partitions
+-- ds=2008-04-08/hr=a1
+-- ds=2008-04-08/hr=b1
+-- without merge, each partition has more files
+-- ds=2008-04-08/hr=a1 has 2 files
+-- ds=2008-04-08/hr=b1 has 6 files
+-- with merge, each partition has fewer files
+-- ds=2008-04-08/hr=a1 has 1 file
+-- ds=2008-04-08/hr=b1 has 4 files
+-- The following shows file size and name in each directory
+-- hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- without merge
+-- 155 000000_0
+-- 155 000001_0
+-- with merge
+-- 254 000000_0
+-- hr=b1/key=103/value=val_103:
+-- without merge
+-- 99 000000_0
+-- 99 000001_0
+-- with merge
+-- 142 000001_0
+-- hr=b1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- without merge
+-- 5181 000000_0
+-- 5181 000001_0
+-- with merge
+-- 5181 000000_0
+-- 5181 000001_0
+-- hr=b1/key=484/value=val_484
+-- without merge
+-- 87 000000_0
+-- 87 000001_0
+-- with merge
+-- 118 000002_0
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- create a skewed table
+create table list_bucketing_dynamic_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_dynamic_part partition (ds = '2008-04-08', hr)
+select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_dynamic_part;
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1');
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
+
+set hive.merge.current.job.concatenate.list.bucketing=true;
+-- concatenate the partition and it will merge files
+alter table list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1') concatenate;
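+
+-- (illustrative extension, not in the original commit) the other partition could be merged
+-- the same way; the row counts later in this script are unaffected by concatenation:
+alter table list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1') concatenate;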
+
+desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1');
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_dynamic_part;
+explain extended
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- clean up
+drop table list_bucketing_dynamic_part;
+
+
+
Added: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_9.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_9.q?rev=1417374&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_9.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_dml_9.q Wed Dec 5 11:59:15 2012
@@ -0,0 +1,71 @@
+set hive.mapred.supports.subdirectories=true;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
+set hive.merge.smallfiles.avgsize=200;
+set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+
+-- list bucketing DML: static partition. multiple skewed columns. merge.
+-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
+-- 5263 000000_0
+-- 5263 000001_0
+-- ds=2008-04-08/hr=11/key=103:
+-- 99 000000_0
+-- 99 000001_0
+-- after merge
+-- 142 000000_0
+-- ds=2008-04-08/hr=11/key=484:
+-- 87 000000_0
+-- 87 000001_0
+-- after merge
+-- 118 000001_0
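+
+-- (illustrative note, not in the original commit) hive.merge.smallfiles.avgsize=200, set
+-- above, triggers the merge pass whenever the average output file size drops below 200
+-- bytes, which the small files listed above are sized to do.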
+
+-- create a skewed table
+create table list_bucketing_static_part (key String, value String)
+ partitioned by (ds String, hr String)
+ skewed by (key) on ('484','103')
+ stored as DIRECTORIES
+ STORED AS RCFILE;
+
+-- list bucketing DML without merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+set hive.merge.mapfiles=true;
+set hive.merge.mapredfiles=true;
+-- list bucketing DML with merge. use the bucketized input format to generate a few small files.
+explain extended
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+insert overwrite table list_bucketing_static_part partition (ds = '2008-04-08', hr = '11')
+select key, value from srcpart where ds = '2008-04-08';
+
+-- check DML result
+show partitions list_bucketing_static_part;
+desc formatted list_bucketing_static_part partition (ds='2008-04-08', hr='11');
+
+select count(1) from srcpart where ds = '2008-04-08';
+select count(*) from list_bucketing_static_part;
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+set hive.optimize.listbucketing=true;
+explain extended
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from list_bucketing_static_part where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
+select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
+
+-- clean up
+drop table list_bucketing_static_part;
Modified: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_1.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_1.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_1.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_1.q Wed Dec 5 11:59:15 2012
@@ -1,7 +1,8 @@
-set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
+set hive.mapred.supports.subdirectories=true;
set hive.optimize.listbucketing=true;
set mapred.input.dir.recursive=true;
+set hive.merge.mapfiles=false;
+set hive.merge.mapredfiles=false;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
@@ -17,66 +18,39 @@ set hive.input.format=org.apache.hadoop.
-- 1. pruner only pick up right directory
-- 2. query result is right
--- create 1 table: fact_daily
--- 1. create a few partitions
--- 2. dfs move partition according to list bucketing structure (simulate DML)
--- $/fact_daily/ds=1/hr=4/x=../y=..
--- notes: waste all partitions except ds=1 and hr=4 for list bucketing query test
--- 3. alter it to skewed table and set up location map
--- 4. list bucketing query
--- fact_daily (ds=1 and hr=4) will be used for list bucketing query
-CREATE TABLE fact_daily(x int, y STRING) PARTITIONED BY (ds STRING, hr STRING)
-LOCATION '${hiveconf:hive.metastore.warehouse.dir}/fact_daily';
-
--- create /fact_daily/ds=1/hr=1 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='1')
-SELECT key, value FROM src WHERE key=484;
-
--- create /fact_daily/ds=1/hr=2 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='2')
-SELECT key, value FROM src WHERE key=369 or key=406;
-
--- create /fact_daily/ds=1/hr=3 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='3')
-SELECT key, value FROM src WHERE key=238;
-
-dfs -lsr ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=1 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=484/y=val_484;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=2 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=3 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=238/y=val_238;
-dfs -lsr ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1;
-
--- switch fact_daily to skewed table and point its location to /fact_daily/ds=1
-alter table fact_daily skewed by (x,y) on ((484,'val_484'),(238,'val_238'));
-ALTER TABLE fact_daily ADD PARTITION (ds='1', hr='4');
-
--- set List Bucketing location map
-alter table fact_daily PARTITION (ds = '1', hr='4') set skewed location ((484,'val_484')='${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=484/y=val_484',
-(238,'val_238')='${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=238/y=val_238');
+-- create a skewed table
+create table fact_daily (key String, value String)
+partitioned by (ds String, hr String)
+skewed by (key, value) on (('484','val_484'),('238','val_238'))
+stored as DIRECTORIES;
+
+insert overwrite table fact_daily partition (ds = '1', hr = '4')
+select key, value from src;
+
describe formatted fact_daily PARTITION (ds = '1', hr='4');
-SELECT * FROM fact_daily WHERE ds='1' and hr='4';
+SELECT count(1) FROM fact_daily WHERE ds='1' and hr='4';
-- pruner only pick up skewed-value directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x FROM fact_daily WHERE ( ds='1' and hr='4') and (x=484 and y= 'val_484');
+explain extended SELECT key FROM fact_daily WHERE ( ds='1' and hr='4') and (key='484' and value= 'val_484');
-- List Bucketing Query
-SELECT x FROM fact_daily WHERE ( ds='1' and hr='4') and (x=484 and y= 'val_484');
+SELECT key FROM fact_daily WHERE ( ds='1' and hr='4') and (key='484' and value= 'val_484');
-- pruner only pick up skewed-value directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x,y FROM fact_daily WHERE ( ds='1' and hr='4') and (x=238 and y= 'val_238');
+explain extended SELECT key,value FROM fact_daily WHERE ( ds='1' and hr='4') and (key='238' and value= 'val_238');
-- List Bucketing Query
-SELECT x,y FROM fact_daily WHERE ( ds='1' and hr='4') and (x=238 and y= 'val_238');
+SELECT key,value FROM fact_daily WHERE ( ds='1' and hr='4') and (key='238' and value= 'val_238');
-- pruner only pick up default directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x FROM fact_daily WHERE ( ds='1' and hr='4') and (y = "3");
+explain extended SELECT key FROM fact_daily WHERE ( ds='1' and hr='4') and (value = "3");
-- List Bucketing Query
-SELECT x FROM fact_daily WHERE ( ds='1' and hr='4') and (y = "3");
+SELECT key FROM fact_daily WHERE ( ds='1' and hr='4') and (value = "3");
-- pruner only pick up default directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x,y FROM fact_daily WHERE ( ds='1' and hr='4') and x = 495;
+explain extended SELECT key,value FROM fact_daily WHERE ( ds='1' and hr='4') and key = '495';
-- List Bucketing Query
-SELECT x,y FROM fact_daily WHERE ( ds='1' and hr='4') and x = 369;
+SELECT key,value FROM fact_daily WHERE ( ds='1' and hr='4') and key = '369';
Modified: hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_2.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_2.q?rev=1417374&r1=1417373&r2=1417374&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_2.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/list_bucket_query_multiskew_2.q Wed Dec 5 11:59:15 2012
@@ -1,5 +1,4 @@
set hive.mapred.supports.subdirectories=true;
-set hive.internal.ddl.list.bucketing.enable=true;
set hive.optimize.listbucketing=true;
set mapred.input.dir.recursive=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
@@ -16,64 +15,36 @@ set hive.input.format=org.apache.hadoop.
-- 1. pruner only pick up right directory
-- 2. query result is right
+-- create a skewed table
+create table fact_daily (key String, value String)
+partitioned by (ds String, hr String)
+skewed by (key, value) on (('484','val_484'),('238','val_238'))
+stored as DIRECTORIES;
+
+insert overwrite table fact_daily partition (ds = '1', hr = '4')
+select key, value from src;
--- create 1 table: fact_daily
--- 1. create a few partitions
--- 2. dfs move partition according to list bucketing structure (simulate DML)
--- $/fact_daily/ds=1/hr=4/x=../y=..
--- notes: waste all partitions except ds=1 and hr=4 for list bucketing query test
--- 3. alter it to skewed table and set up location map
--- 4. list bucketing query
--- fact_daily (ds=1 and hr=4) will be used for list bucketing query
-CREATE TABLE fact_daily(x int, y STRING) PARTITIONED BY (ds STRING, hr STRING)
-LOCATION '${hiveconf:hive.metastore.warehouse.dir}/fact_daily';
-
--- create /fact_daily/ds=1/hr=1 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='1')
-SELECT key, value FROM src WHERE key=484;
-
--- create /fact_daily/ds=1/hr=2 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='2')
-SELECT key, value FROM src WHERE key=369 or key=406;
-
--- create /fact_daily/ds=1/hr=3 directory
-INSERT OVERWRITE TABLE fact_daily PARTITION (ds='1', hr='3')
-SELECT key, value FROM src WHERE key=238;
-
-dfs -lsr ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=1 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=484/y=val_484;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=2 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME;
-dfs -mv ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=3 ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=238/y=val_238;
-dfs -lsr ${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1;
-
--- switch fact_daily to skewed table and point its location to /fact_daily/ds=1
-alter table fact_daily skewed by (x,y) on ((484,'val_484'),(238,'val_238'));
-ALTER TABLE fact_daily ADD PARTITION (ds='1', hr='4');
-
--- set List Bucketing location map
-alter table fact_daily PARTITION (ds = '1', hr='4') set skewed location ((484,'val_484')='${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=484/y=val_484',
-(238,'val_238')='${hiveconf:hive.metastore.warehouse.dir}/fact_daily/ds=1/hr=4/x=238/y=val_238');
describe formatted fact_daily PARTITION (ds = '1', hr='4');
-SELECT * FROM fact_daily WHERE ds='1' and hr='4';
+SELECT count(1) FROM fact_daily WHERE ds='1' and hr='4';
-- pruner only pick up default directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x,y FROM fact_daily WHERE ds='1' and hr='4' and y= 'val_484';
+explain extended SELECT key, value FROM fact_daily WHERE ds='1' and hr='4' and value= 'val_484';
-- List Bucketing Query
-SELECT x,y FROM fact_daily WHERE ds='1' and hr='4' and y= 'val_484';
+SELECT key, value FROM fact_daily WHERE ds='1' and hr='4' and value= 'val_484';
-- pruner only pick up default directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x FROM fact_daily WHERE ds='1' and hr='4' and x= 406;
+explain extended SELECT key FROM fact_daily WHERE ds='1' and hr='4' and key= '406';
-- List Bucketing Query
-SELECT x,y FROM fact_daily WHERE ds='1' and hr='4' and x= 406;
+SELECT key, value FROM fact_daily WHERE ds='1' and hr='4' and key= '406';
-- pruner only pick up skewed-value directory
-- explain plan shows which directory selected: Truncated Path -> Alias
-explain extended SELECT x,y FROM fact_daily WHERE ds='1' and hr='4' and ( (x=484 and y ='val_484') or (x=238 and y= 'val_238')) ;
+explain extended SELECT key, value FROM fact_daily WHERE ds='1' and hr='4' and ( (key='484' and value ='val_484') or (key='238' and value= 'val_238')) ;
-- List Bucketing Query
-SELECT x,y FROM fact_daily WHERE ds='1' and hr='4' and ( (x=484 and y ='val_484') or (x=238 and y= 'val_238')) ;
+SELECT key, value FROM fact_daily WHERE ds='1' and hr='4' and ( (key='484' and value ='val_484') or (key='238' and value= 'val_238')) ;
-- clean up
drop table fact_daily;
\ No newline at end of file