You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hcatalog-commits@incubator.apache.org by tr...@apache.org on 2012/09/25 21:54:22 UTC

svn commit: r1390178 - in /incubator/hcatalog/trunk: CHANGES.txt src/java/org/apache/hcatalog/common/HCatConstants.java src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java

Author: travis
Date: Tue Sep 25 21:54:22 2012
New Revision: 1390178

URL: http://svn.apache.org/viewvc?rev=1390178&view=rev
Log:
HCATALOG-506 desired number of input splits for large files

Modified:
    incubator/hcatalog/trunk/CHANGES.txt
    incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
    incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java

Modified: incubator/hcatalog/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/CHANGES.txt?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/CHANGES.txt (original)
+++ incubator/hcatalog/trunk/CHANGES.txt Tue Sep 25 21:54:22 2012
@@ -40,6 +40,8 @@ Trunk (unreleased changes)
   HCAT-427 Document storage-based authorization (lefty via gates)
 
   IMPROVEMENTS
+  HCAT-506 desired number of input splits for large files (gmalewicz via traviscrawford)
+
   HCAT-461 Refactor server-extensions as a subproject (traviscrawford)
 
   HCAT-500 HCatStorer should honor user-specified path for external tables (pengfeng via traviscrawford)

Modified: incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java (original)
+++ incubator/hcatalog/trunk/src/java/org/apache/hcatalog/common/HCatConstants.java Tue Sep 25 21:54:22 2012
@@ -79,6 +79,16 @@ public final class HCatConstants {
     public static final String HCAT_METASTORE_PRINCIPAL
         = HiveConf.ConfVars.METASTORE_KERBEROS_PRINCIPAL.varname;
 
+    /**
+     * The desired number of input splits produced for each partition. When the
+     * input files are large and few, we want to split them into many splits,
+     * so as to increase the parallelism of loading the splits. Two other
+     * parameters, mapred.min.split.size and mapred.max.split.size, can also
+     * be used to control the number of input splits.
+     */
+    public static final String HCAT_DESIRED_PARTITION_NUM_SPLITS =
+        "hcat.desired.partition.num.splits";
+
     // IMPORTANT IMPORTANT IMPORTANT!!!!!
     //The keys used to store info into the job Configuration.
     //If any new keys are added, the HCatStorer needs to be updated. The HCatStorer

Modified: incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java
URL: http://svn.apache.org/viewvc/incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java?rev=1390178&r1=1390177&r2=1390178&view=diff
==============================================================================
--- incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java (original)
+++ incubator/hcatalog/trunk/src/java/org/apache/hcatalog/mapreduce/HCatBaseInputFormat.java Tue Sep 25 21:54:22 2012
@@ -142,11 +142,17 @@ public abstract class HCatBaseInputForma
             org.apache.hadoop.mapred.InputFormat inputFormat =
                 getMapRedInputFormat(jobConf, inputFormatClass);
 
-            //Call getSplit on the InputFormat, create an
-            //HCatSplit for each underlying split
-            //NumSplits is 0 for our purposes
+            //Call getSplits on the InputFormat, create an HCatSplit for each
+            //underlying split. When the desired number of input splits is missing,
+            //use a default number (denoted by zero).
+            //TODO(malewicz): Currently each partition is split independently into
+            //a desired number. However, we want the union of all partitions to be
+            //split into a desired number while maintaining balanced sizes of input
+            //splits.
+            int desiredNumSplits =
+                conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
             org.apache.hadoop.mapred.InputSplit[] baseSplits =
-                inputFormat.getSplits(jobConf, 0);
+                inputFormat.getSplits(jobConf, desiredNumSplits);
 
             for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
                 splits.add(new HCatSplit(