You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hcatalog-commits@incubator.apache.org by tr...@apache.org on 2012/09/25 21:54:22 UTC
svn commit: r1390178 - in /incubator/hcatalog/trunk: CHANGES.txt
src/java/org/apache/hcatalog/common/HCatConstants.java
src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
Author: travis
Date: Tue Sep 25 21:54:22 2012
New Revision: 1390178
URL: http://svn.apache.org/viewvc?rev=1390178&view=rev
Log:
HCATALOG-506 desired number of input splits for large files
Modified:
incubator/hcatalog/trunk/CHANGES.txt
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
Modified: incubator/hcatalog/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/CHANGES.txt?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/CHANGES.txt (original)
+++ incubator/hcatalog/trunk/CHANGES.txt Tue Sep 25 21:54:22 2012
@@ -40,6 +40,8 @@ Trunk (unreleased changes)
HCAT-427 Document storage-based authorization (lefty via gates)
IMPROVEMENTS
+ HCAT-506 desired number of input splits for large files (gmalewicz via traviscrawford)
+
HCAT-461 Refactor server-extensions as a subproject (traviscrawford)
HCAT-500 HCatStorer should honor user-specified path for external tables (pengfeng via traviscrawford)
Modified: incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java (original)
+++ incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java Tue Sep 25 21:54:22 2012
@@ -79,6 +79,16 @@ public final class HCatConstants {
public static final String HCAT_METASTORE_PRINCIPAL
= HiveConf.ConfVars.METASTORE_KERBEROS_PRINCIPAL.varname;
+ /**
+ * The desired number of input splits produced for each partition. When the
+ * input files are large and few, we want to split them into many splits,
+ * so as to increase the parallelism of loading the splits. Try also two
+ * other parameters, mapred.min.split.size and mapred.max.split.size, to
+ * control the number of input splits.
+ */
+ public static final String HCAT_DESIRED_PARTITION_NUM_SPLITS =
+ "hcat.desired.partition.num.splits";
+
// IMPORTANT IMPORTANT IMPORTANT!!!!!
//The keys used to store info into the job Configuration.
//If any new keys are added, the HCatStorer needs to be updated. The HCatStorer
Modified: incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java (original)
+++ incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java Tue Sep 25 21:54:22 2012
@@ -142,11 +142,17 @@ public abstract class HCatBaseInputForma
org.apache.hadoop.mapred.InputFormat inputFormat =
getMapRedInputFormat(jobConf, inputFormatClass);
- //Call getSplit on the InputFormat, create an
- //HCatSplit for each underlying split
- //NumSplits is 0 for our purposes
+ //Call getSplits on the InputFormat, create an HCatSplit for each
+ //underlying split. When the desired number of input splits is missing,
+ //use a default number (denoted by zero).
+ //TODO(malewicz): Currently each partition is split independently into
+ //a desired number. However, we want the union of all partitions to be
+ //split into a desired number while maintaining balanced sizes of input
+ //splits.
+ int desiredNumSplits =
+ conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
org.apache.hadoop.mapred.InputSplit[] baseSplits =
- inputFormat.getSplits(jobConf, 0);
+ inputFormat.getSplits(jobConf, desiredNumSplits);
for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
splits.add(new HCatSplit(