You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2014/02/16 06:58:16 UTC

svn commit: r1568726 - in /mahout/trunk: CHANGELOG core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java

Author: smarthi
Date: Sun Feb 16 05:58:15 2014
New Revision: 1568726

URL: http://svn.apache.org/r1568726
Log:
MAHOUT-1417: Random decision forest implementation fails in Hadoop 2

Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
    mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1568726&r1=1568725&r2=1568726&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun Feb 16 05:58:15 2014
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 1.0 - unreleased
 
+  MAHOUT-1417: Random decision forest implementation fails in Hadoop 2 (srowen)
+
   MAHOUT-1416: Make access of DecisionForest.read(dataInput) less restricted (Manoj Awasthi via smarthi)
   
   MAHOUT-1415: Clone method on sparse matrices fails if there is an empty row which has not been set explicitly (till.rohrmann via ssc)

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java?rev=1568726&r1=1568725&r2=1568726&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java Sun Feb 16 05:58:15 2014
@@ -35,15 +35,20 @@ import org.apache.mahout.classifier.df.m
 import org.apache.mahout.classifier.df.node.Node;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
 
 /**
  * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
  */
 public class PartialBuilder extends Builder {
 
+  private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
+
   public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
     this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
   }
@@ -73,6 +78,18 @@ public class PartialBuilder extends Buil
     
     job.setInputFormatClass(TextInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+    // For this implementation to work, mapred.map.tasks needs to be set to the actual
+    // number of mappers Hadoop will use:
+    TextInputFormat inputFormat = new TextInputFormat();
+    List<?> splits = inputFormat.getSplits(job);
+    if (splits == null || splits.isEmpty()) {
+      log.warn("Unable to compute number of splits?");
+    } else {
+      int numSplits = splits.size();
+      log.info("Setting mapred.map.tasks = {}", numSplits);
+      conf.setInt("mapred.map.tasks", numSplits);
+    }
   }
   
   @Override

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java?rev=1568726&r1=1568725&r2=1568726&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java Sun Feb 16 05:58:15 2014
@@ -116,8 +116,8 @@ public class Step1Mapper extends MapredM
   }
   
   /**
-   * Compute the number of trees for a given partition. The first partition (0) may be longer than the rest of
-   * partition because of the remainder.
+   * Compute the number of trees for a given partition. The first partitions may be longer
+   * than the rest because of the remainder.
    * 
    * @param numMaps
    *          total number of maps (partitions)
@@ -127,12 +127,9 @@ public class Step1Mapper extends MapredM
    *          partition to compute the number of trees for
    */
   public static int nbTrees(int numMaps, int numTrees, int partition) {
-    int nbTrees = numTrees / numMaps;
-    if (partition == 0) {
-      nbTrees += numTrees - nbTrees * numMaps;
-    }
-    
-    return nbTrees;
+    int treesPerMapper = numTrees / numMaps;
+    int remainder = numTrees - numMaps * treesPerMapper;
+    return treesPerMapper + (partition < remainder ? 1 : 0);
   }
   
   @Override
@@ -162,6 +159,8 @@ public class Step1Mapper extends MapredM
         MapredOutput emOut = new MapredOutput(tree);
         context.write(key, emOut);
       }
+
+      context.progress();
     }
   }