Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 21:27:30 UTC

svn commit: r909900 [3/4] - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df: ./ builder/ callback/ data/ data/conditions/ mapred/ mapred/inmem/ mapred/partial/ mapreduce/ mapreduce/inmem/ mapreduce/partial/ node/ ref/ split/ tools/

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/InterResults.java Sat Feb 13 20:27:25 2010
@@ -26,38 +26,47 @@
 
 /**
  * Stores/Loads the intermediate results of step1 needed by step2.<br>
- * This class should not be needed outside of the partial package, so all its
- * methods are protected.<br>
+ * This class should not be needed outside of the partial package, so all its methods are protected.<br>
  */
 public class InterResults {
-  private InterResults() {
-  }
-
+  private InterResults() {}
+  
   /**
    * Load the trees and the keys returned from the first step
    * 
-   * @param fs forest path file system
-   * @param forestPath file path to the (key,tree) file
-   * @param numMaps number of map tasks
-   * @param numTrees total number of trees in the forest
-   * @param partition current partition
-   * @param keys array of size numTrees, will contain the loaded keys
-   * @param trees array of size numTrees, will contain the loaded trees
+   * @param fs
+   *          forest path file system
+   * @param forestPath
+   *          file path to the (key,tree) file
+   * @param numMaps
+   *          number of map tasks
+   * @param numTrees
+   *          total number of trees in the forest
+   * @param partition
+   *          current partition
+   * @param keys
+   *          array of size numTrees, will contain the loaded keys
+   * @param trees
+   *          array of size numTrees, will contain the loaded trees
    * @return number of instances in the current partition
    * @throws IOException
    */
-  public static int load(FileSystem fs, Path forestPath, int numMaps,
-      int numTrees, int partition, TreeID[] keys, Node[] trees)
-      throws IOException {
+  public static int load(FileSystem fs,
+                         Path forestPath,
+                         int numMaps,
+                         int numTrees,
+                         int partition,
+                         TreeID[] keys,
+                         Node[] trees) throws IOException {
     if (keys.length != trees.length) {
       throw new IllegalArgumentException("keys.length != trees.length");
     }
-
+    
     FSDataInputStream in = fs.open(forestPath);
-
+    
     TreeID key = new TreeID();
     int numInstances = -1;
-
+    
     try {
       // get current partition's size
       for (int p = 0; p < numMaps; p++) {
@@ -67,12 +76,12 @@
           in.readInt();
         }
       }
-
+      
       // load (key, tree)
       int current = 0;
       for (int index = 0; index < numTrees; index++) {
         key.readFields(in);
-
+        
         if (key.partition() == partition) {
           // skip the trees of the current partition
           Node.read(in);
@@ -82,49 +91,52 @@
           current++;
         }
       }
-
+      
       if (current != keys.length) {
         throw new IllegalStateException("loaded less keys/trees than expected");
       }
     } finally {
       in.close();
     }
-
+    
     return numInstances;
   }
-
+  
   /**
    * Write the forest trees into a file
    * 
-   * @param fs File System
-   * @param keys keys returned by the first step
-   * @param trees trees returned by the first step
-   * @param sizes partitions' sizes in hadoop order
+   * @param fs
+   *          File System
+   * @param keys
+   *          keys returned by the first step
+   * @param trees
+   *          trees returned by the first step
+   * @param sizes
+   *          partitions' sizes in hadoop order
    * @throws IOException
    */
-  public static void store(FileSystem fs, Path forestPath,
-      TreeID[] keys, Node[] trees, int[] sizes) throws IOException {
+  public static void store(FileSystem fs, Path forestPath, TreeID[] keys, Node[] trees, int[] sizes) throws IOException {
     if (keys.length != trees.length) {
       throw new IllegalArgumentException("keys.length != trees.length");
     }
-
+    
     int numTrees = keys.length;
     int numMaps = sizes.length;
-
+    
     FSDataOutputStream out = fs.create(forestPath);
-
+    
     // write partitions' sizes
     for (int p = 0; p < numMaps; p++) {
       out.writeInt(sizes[p]);
     }
-
+    
     // write the data
     for (int index = 0; index < numTrees; index++) {
       keys[index].write(out);
       trees[index].write(out);
     }
-
+    
     out.close();
   }
-
+  
 }

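For context, a minimal usage sketch (not part of this commit) of the intermediate forest file handled above: store() writes the partitions' sizes followed by every (TreeID, Node) pair, and load() returns the current partition's size while filling the key/tree arrays with the trees grown by the other partitions (so the arrays are sized like Step2Mapper's nbConcerned, not numTrees). The class name, path and values below are made up; it assumes a local FileSystem and trivial Leaf trees.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.df.mapreduce.partial.InterResults;
import org.apache.mahout.df.mapreduce.partial.TreeID;
import org.apache.mahout.df.node.Leaf;
import org.apache.mahout.df.node.Node;

public class InterResultsSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path forestPath = new Path("step1.forest"); // hypothetical local path

    // hypothetical step1 output: 2 partitions, 2 trees each (Leaf trees keep the sketch small)
    TreeID[] keys = {new TreeID(0, 0), new TreeID(0, 1), new TreeID(1, 2), new TreeID(1, 3)};
    Node[] trees = {new Leaf(0), new Leaf(1), new Leaf(0), new Leaf(1)};
    int[] sizes = {100, 80}; // instances per partition, in Hadoop's order

    InterResults.store(fs, forestPath, keys, trees, sizes);

    // partition 1 loads only the 2 trees grown by partition 0
    TreeID[] loadedKeys = new TreeID[2];
    Node[] loadedTrees = new Node[2];
    int numInstances = InterResults.load(fs, forestPath, 2, 4, 1, loadedKeys, loadedTrees);
    System.out.println(numInstances); // 80, the size of partition 1
  }
}
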
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/PartialBuilder.java Sat Feb 13 20:27:25 2010
@@ -35,20 +35,19 @@
 import org.apache.mahout.df.builder.TreeBuilder;
 import org.apache.mahout.df.callback.PredictionCallback;
 import org.apache.mahout.df.mapreduce.Builder;
-import org.apache.mahout.df.mapreduce.partial.Step0Job.Step0Output;
 import org.apache.mahout.df.mapreduce.MapredOutput;
+import org.apache.mahout.df.mapreduce.partial.Step0Job.Step0Output;
 import org.apache.mahout.df.node.Node;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Builds a random forest using partial data. Each mapper uses only the data
- * given by its InputSplit
+ * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
  */
 public class PartialBuilder extends Builder {
-
+  
   private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
-
+  
   /**
    * Indicates if we should run the second step of the builder.<br>
   * This parameter is only meant for debugging, so we keep it protected.
@@ -59,111 +58,121 @@
   protected static boolean isStep2(Configuration conf) {
     return conf.getBoolean("debug.mahout.rf.partial.step2", true);
   }
-
+  
   /**
   * Should the second step of the builder be run?
    * 
    * @param conf
-   * @param value true to indicate that the second step will be launched
+   * @param value
+   *          true to indicate that the second step will be launched
    * 
    */
   protected static void setStep2(Configuration conf, boolean value) {
     conf.setBoolean("debug.mahout.rf.partial.step2", value);
   }
-
-  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
-      Path datasetPath, Long seed) {
+  
+  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
     this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
   }
-
-  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath,
-      Path datasetPath, Long seed, Configuration conf) {
+  
+  public PartialBuilder(TreeBuilder treeBuilder,
+                        Path dataPath,
+                        Path datasetPath,
+                        Long seed,
+                        Configuration conf) {
     super(treeBuilder, dataPath, datasetPath, seed, conf);
   }
-
   
   @Override
-  protected void configureJob(Job job, int nbTrees, boolean oobEstimate)
-      throws IOException {
+  protected void configureJob(Job job, int nbTrees, boolean oobEstimate) throws IOException {
     Configuration conf = job.getConfiguration();
     
     job.setJarByClass(PartialBuilder.class);
     
     FileInputFormat.setInputPaths(job, getDataPath());
     FileOutputFormat.setOutputPath(job, getOutputPath(conf));
-
+    
     job.setOutputKeyClass(TreeID.class);
     job.setOutputValueClass(MapredOutput.class);
-
+    
     job.setMapperClass(Step1Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormatClass(TextInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
   }
-
+  
   @Override
-  protected DecisionForest parseOutput(Job job, PredictionCallback callback)
-      throws IOException, ClassNotFoundException, InterruptedException {
+  protected DecisionForest parseOutput(Job job, PredictionCallback callback) throws IOException,
+                                                                            ClassNotFoundException,
+                                                                            InterruptedException {
     Configuration conf = job.getConfiguration();
     
-    int numTrees = getNbTrees(conf);
-
+    int numTrees = Builder.getNbTrees(conf);
+    
     Path outputPath = getOutputPath(conf);
-
-    log.info("Computing partitions' first ids...");
+    
+    PartialBuilder.log.info("Computing partitions' first ids...");
     Step0Job step0 = new Step0Job(getOutputPath(conf), getDataPath(), getDatasetPath());
     Step0Output[] partitions = step0.run(new Configuration(conf));
-
-    log.info("Processing the output...");
+    
+    PartialBuilder.log.info("Processing the output...");
     TreeID[] keys = new TreeID[numTrees];
     Node[] trees = new Node[numTrees];
     int[] firstIds = Step0Output.extractFirstIds(partitions);
-    processOutput(job, outputPath, firstIds, keys, trees, callback);
-
+    PartialBuilder.processOutput(job, outputPath, firstIds, keys, trees, callback);
+    
     // JobClient should have updated numMaps to the correct number of maps
     int numMaps = partitions.length;
-
+    
     // call the second step in order to complete the oob predictions
-    if (callback != null && numMaps > 1 && isStep2(conf)) {
-      log.info("*****************************");
-      log.info("Second Step");
-      log.info("*****************************");
+    if ((callback != null) && (numMaps > 1) && PartialBuilder.isStep2(conf)) {
+      PartialBuilder.log.info("*****************************");
+      PartialBuilder.log.info("Second Step");
+      PartialBuilder.log.info("*****************************");
       Step2Job step2 = new Step2Job(getOutputPath(conf), getDataPath(), getDatasetPath(), partitions);
-
+      
       step2.run(new Configuration(conf), keys, trees, callback);
     }
-
+    
     return new DecisionForest(Arrays.asList(trees));
   }
-
+  
   /**
    * Processes the output from the output path.<br>
    * 
    * @param job
-   * @param outputPath directory that contains the output of the job
-   * @param firstIds partitions' first ids in hadoop's order
-   * @param keys can be null
-   * @param trees can be null
-   * @param callback can be null
+   * @param outputPath
+   *          directory that contains the output of the job
+   * @param firstIds
+   *          partitions' first ids in hadoop's order
+   * @param keys
+   *          can be null
+   * @param trees
+   *          can be null
+   * @param callback
+   *          can be null
    * @throws IOException
    */
-  protected static void processOutput(JobContext job, Path outputPath,
-      int[] firstIds, TreeID[] keys, Node[] trees, PredictionCallback callback)
-      throws IOException {
-    if ((keys != null && trees == null)||(keys == null && trees != null)) {
+  protected static void processOutput(JobContext job,
+                                      Path outputPath,
+                                      int[] firstIds,
+                                      TreeID[] keys,
+                                      Node[] trees,
+                                      PredictionCallback callback) throws IOException {
+    if (((keys != null) && (trees == null)) || ((keys == null) && (trees != null))) {
       throw new IllegalArgumentException("if keys is null, trees should also be null");
     }
-    if (keys != null && keys.length != trees.length) {
+    if ((keys != null) && (keys.length != trees.length)) {
       throw new IllegalArgumentException("keys.length != trees.length");
     }
     
     Configuration conf = job.getConfiguration();
     
     FileSystem fs = outputPath.getFileSystem(conf);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     // read all the outputs
     TreeID key = new TreeID();
     MapredOutput value = new MapredOutput();
@@ -171,7 +180,7 @@
     int index = 0;
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, conf);
-
+      
       try {
         while (reader.next(key, value)) {
           if (keys != null) {
@@ -182,7 +191,7 @@
             trees[index] = value.getTree();
           }
           
-          processOutput(firstIds, key, value, callback);
+          PartialBuilder.processOutput(firstIds, key, value, callback);
           
           index++;
         }
@@ -190,30 +199,31 @@
         reader.close();
       }
     }
-
+    
     // make sure we got all the keys/values
-    if (keys != null && index != keys.length) {
+    if ((keys != null) && (index != keys.length)) {
       throw new IllegalStateException("Some key/values are missing from the output");
     }
   }
-
+  
   /**
-   * Process the output, extracting the trees and passing the predictions to the
-   * callback
+   * Process the output, extracting the trees and passing the predictions to the callback
    * 
-   * @param firstIds partitions' first ids in hadoop's order
+   * @param firstIds
+   *          partitions' first ids in hadoop's order
    * @param callback
    * @return
    */
-  private static void processOutput(int[] firstIds, TreeID key,
-      MapredOutput value, PredictionCallback callback) {
-
+  private static void processOutput(int[] firstIds,
+                                    TreeID key,
+                                    MapredOutput value,
+                                    PredictionCallback callback) {
+    
     if (callback != null) {
       int[] predictions = value.getPredictions();
-
+      
       for (int instanceId = 0; instanceId < predictions.length; instanceId++) {
-        callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, 
-            predictions[instanceId]);
+        callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, predictions[instanceId]);
       }
     }
   }

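For illustration, a minimal plain-Java sketch (not part of this commit) of the index arithmetic used in processOutput(firstIds, key, value, callback) above: each mapper numbers instances locally within its own split, so the partition's first id (computed by Step0Job) is added to turn an oob prediction's local index into an id that is unique across the whole training set. The class name and values below are made up.

public class OobIndexSketch {
  public static void main(String[] args) {
    int[] firstIds = {0, 120, 220}; // partitions' first instance ids, in Hadoop's order
    int partition = 1;              // partition that grew the tree (key.partition())
    int localInstanceId = 7;        // index of the prediction within that partition's split

    // same arithmetic as callback.prediction(key.treeId(), firstIds[key.partition()] + instanceId, ...)
    int globalInstanceId = firstIds[partition] + localInstanceId;
    System.out.println(globalInstanceId); // 127
  }
}
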
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step0Job.java Sat Feb 13 20:27:25 2010
@@ -47,25 +47,26 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * preparation step of the partial mapreduce builder. Computes some stats that
- * will be used by the builder.
+ * Preparation step of the partial mapreduce builder. Computes some stats that will be used by the builder.
  */
 public class Step0Job {
-
+  
   private static final Logger log = LoggerFactory.getLogger(Step0Job.class);
   
   /** directory that will hold this job's output */
   private final Path outputPath;
-
+  
   /** file that contains the serialized dataset */
   private final Path datasetPath;
-
+  
   /** directory that contains the data used in the first step */
   private final Path dataPath;
-
+  
   /**
-   * @param base base directory
-   * @param dataPath data used in the first step
+   * @param base
+   *          base directory
+   * @param dataPath
+   *          data used in the first step
    * @param datasetPath
    */
   public Step0Job(Path base, Path dataPath, Path datasetPath) {
@@ -73,70 +74,74 @@
     this.dataPath = dataPath;
     this.datasetPath = datasetPath;
   }
-
+  
   /**
    * Computes the partitions' info in Hadoop's order
    * 
-   * @param conf configuration
+   * @param conf
+   *          configuration
    * @return partitions' info in Hadoop's order
    */
-  public Step0Output[] run(Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
+  public Step0Output[] run(Configuration conf) throws IOException,
+                                              ClassNotFoundException,
+                                              InterruptedException {
     
     // check the output
-    if (outputPath.getFileSystem(conf).exists(outputPath))
+    if (outputPath.getFileSystem(conf).exists(outputPath)) {
       throw new IOException("Output path already exists : " + outputPath);
-
+    }
+    
     // put the dataset into the DistributedCache
     // use setCacheFiles() to overwrite the first-step cache files
-    URI[] files = { datasetPath.toUri() };
+    URI[] files = {datasetPath.toUri()};
     DistributedCache.setCacheFiles(files, conf);
-
+    
     Job job = new Job(conf);
     job.setJarByClass(Step0Job.class);
-
+    
     FileInputFormat.setInputPaths(job, dataPath);
     FileOutputFormat.setOutputPath(job, outputPath);
-
+    
     job.setOutputKeyClass(IntWritable.class);
     job.setOutputValueClass(Step0Output.class);
-
+    
     job.setMapperClass(Step0Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormatClass(TextInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
-
+    
     // run the job
     job.waitForCompletion(true);
-
+    
     return parseOutput(job);
   }
-
+  
   /**
    * Extracts the output and processes it
-   *
+   * 
    * @return info for each partition in Hadoop's order
    * @throws IOException
    */
   protected Step0Output[] parseOutput(JobContext job) throws IOException {
     Configuration conf = job.getConfiguration();
     
-    log.info("mapred.map.tasks = {}", conf.getInt("mapred.map.tasks", -1));
+    Step0Job.log.info("mapred.map.tasks = {}", conf.getInt("mapred.map.tasks", -1));
     
     FileSystem fs = outputPath.getFileSystem(conf);
-
+    
     Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
-
+    
     List<Integer> keys = new ArrayList<Integer>();
     List<Step0Output> values = new ArrayList<Step0Output>();
-
+    
     // read all the outputs
     IntWritable key = new IntWritable();
     Step0Output value = new Step0Output(0L, 0);
-
+    
     for (Path path : outfiles) {
       Reader reader = new Reader(fs, path, conf);
-
+      
       try {
         while (reader.next(key, value)) {
           keys.add(key.get());
@@ -146,10 +151,10 @@
         reader.close();
       }
     }
-
-    return processOutput(keys, values);
+    
+    return Step0Job.processOutput(keys, values);
   }
-
+  
   /**
    * Replaces the first id for each partition in Hadoop's order
    * 
@@ -164,14 +169,14 @@
     Step0Output[] sorted = new Step0Output[numMaps];
     values.toArray(sorted);
     Arrays.sort(sorted);
-
+    
     // compute the partitions firstIds (file order)
     int[] orderedIds = new int[numMaps];
     orderedIds[0] = 0;
     for (int p = 1; p < numMaps; p++) {
       orderedIds[p] = orderedIds[p - 1] + sorted[p - 1].size;
     }
-
+    
     // update the values' first ids
     for (int p = 0; p < numMaps; p++) {
       int order = ArrayUtils.indexOf(sorted, values.get(p));
@@ -183,28 +188,27 @@
     for (int p = 0; p < numMaps; p++) {
       reordered[keys.get(p)] = values.get(p);
     }
-
+    
     return reordered;
   }
-
+  
   /**
    * Outputs the first key and the size of the partition
    * 
    */
-  protected static class Step0Mapper extends 
-      Mapper<LongWritable, Text, IntWritable, Step0Output> {
-
+  protected static class Step0Mapper extends Mapper<LongWritable,Text,IntWritable,Step0Output> {
+    
     private int partition;
-
+    
     private int size;
-
+    
     private Long firstId;
-
+    
     @Override
     protected void setup(Context context) throws IOException, InterruptedException {
       configure(context.getConfiguration().getInt("mapred.task.partition", -1));
     }
-
+    
     /**
      * Useful when testing
      * 
@@ -216,64 +220,64 @@
         throw new IllegalArgumentException("Wrong partition id : " + partition);
       }
     }
-
+    
     @Override
-    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    protected void map(LongWritable key, Text value, Context context) throws IOException,
+                                                                     InterruptedException {
       if (firstId == null) {
         firstId = key.get();
       }
-
+      
       size++;
     }
-
+    
     @Override
     protected void cleanup(Context context) throws IOException, InterruptedException {
       context.write(new IntWritable(partition), new Step0Output(firstId, size));
     }
-
+    
   }
-
+  
   /**
    * Output of the step0's mappers
    * 
    */
-  public static class Step0Output implements Writable,
-      Comparable<Step0Output>, Cloneable {
-
+  public static class Step0Output implements Writable, Comparable<Step0Output>, Cloneable {
+    
     /**
      * first key of the partition<br>
      * used to sort the partition
      */
     private long firstId;
-
+    
     /** number of instances in the partition */
     private int size;
-
+    
     protected Step0Output(long firstId, int size) {
       this.firstId = firstId;
       this.size = size;
     }
-
+    
     protected long getFirstId() {
       return firstId;
     }
-
+    
     protected int getSize() {
       return size;
     }
-
+    
     @Override
     public void readFields(DataInput in) throws IOException {
       firstId = in.readLong();
       size = in.readInt();
     }
-
+    
     @Override
     public void write(DataOutput out) throws IOException {
       out.writeLong(firstId);
       out.writeInt(size);
     }
-
+    
     @Override
     public boolean equals(Object other) {
       if (!(other instanceof Step0Output)) {
@@ -281,44 +285,45 @@
       }
       return firstId == ((Step0Output) other).firstId;
     }
-
+    
     @Override
     public int hashCode() {
       return (int) firstId;
     }
-
+    
     @Override
     protected Step0Output clone() {
       return new Step0Output(firstId, size);
     }
-
+    
     @Override
     public int compareTo(Step0Output obj) {
-      if (firstId < obj.firstId)
+      if (firstId < obj.firstId) {
         return -1;
-      else if (firstId > obj.firstId)
+      } else if (firstId > obj.firstId) {
         return 1;
-      else
+      } else {
         return 0;
+      }
     }
-
+    
     public static int[] extractFirstIds(Step0Output[] partitions) {
       int[] ids = new int[partitions.length];
       
       for (int p = 0; p < partitions.length; p++) {
         ids[p] = (int) partitions[p].firstId;
       }
-
+      
       return ids;
     }
-
+    
     public static int[] extractSizes(Step0Output[] partitions) {
       int[] sizes = new int[partitions.length];
       
       for (int p = 0; p < partitions.length; p++) {
         sizes[p] = partitions[p].size;
       }
-
+      
       return sizes;
     }
   }

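For illustration, a minimal plain-Java sketch (not part of this commit) of the first-id computation performed by parseOutput()/processOutput() above: partitions are ordered by the first key their mapper saw (i.e. by their position in the data file), each partition's first id is the running sum of the preceding partitions' sizes, and the result is kept in Hadoop's order. The class name and input values below are made up.

import java.util.Arrays;

public class FirstIdsSketch {

  /** firstKey[p] and size[p] are indexed by mapred.task.partition (Hadoop's order). */
  static int[] firstIds(long[] firstKey, int[] size) {
    Integer[] order = new Integer[size.length];
    for (int p = 0; p < order.length; p++) {
      order[p] = p;
    }
    // file order = ascending first key
    Arrays.sort(order, (a, b) -> Long.compare(firstKey[a], firstKey[b]));

    int[] ids = new int[size.length];
    int sum = 0;
    for (int p : order) { // running sum of the preceding partitions' sizes
      ids[p] = sum;
      sum += size[p];
    }
    return ids; // still indexed in Hadoop's order
  }

  public static void main(String[] args) {
    // partition 1's input split happens to come first in the data file
    int[] ids = firstIds(new long[] {5000L, 0L}, new int[] {120, 100});
    System.out.println(Arrays.toString(ids)); // [100, 0]
  }
}
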
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step1Mapper.java Sat Feb 13 20:27:25 2010
@@ -39,92 +39,95 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * First step of the Partial Data Builder. Builds the trees using the data
- * available in the InputSplit. Predict the oob classes for each tree in its
- * growing partition (input split).
+ * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit.
+ * Predicts the oob classes for each tree in its growing partition (input split).
  */
-public class Step1Mapper extends
-    MapredMapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step1Mapper extends MapredMapper<LongWritable,Text,TreeID,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class);
-
+  
   /** used to convert input values to data instances */
   private DataConverter converter;
-
+  
   private Random rng;
-
+  
   /** number of trees to be built by this mapper */
   private int nbTrees;
-
+  
   /** id of the first tree */
   private int firstTreeId;
-
+  
   /** mapper's partition */
   private int partition;
-
+  
   /** will contain all instances of this mapper's split */
   private final List<Instance> instances = new ArrayList<Instance>();
   
   public int getFirstTreeId() {
     return firstTreeId;
   }
-
+  
   @Override
-  protected void setup(Context context) throws IOException,
-      InterruptedException {
+  protected void setup(Context context) throws IOException, InterruptedException {
     super.setup(context);
     Configuration conf = context.getConfiguration();
-
-    configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition",
-        -1), Builder.getNumMaps(conf), Builder.getNbTrees(conf));
+    
+    configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition", -1),
+      Builder.getNumMaps(conf), Builder.getNbTrees(conf));
   }
-
+  
   /**
    * Useful when testing
    * 
    * @param seed
-   * @param partition current mapper inputSplit partition
-   * @param numMapTasks number of running map tasks
-   * @param numTrees total number of trees in the forest
+   * @param partition
+   *          current mapper inputSplit partition
+   * @param numMapTasks
+   *          number of running map tasks
+   * @param numTrees
+   *          total number of trees in the forest
    */
-  protected void configure(Long seed, int partition, int numMapTasks,
-      int numTrees) {
+  protected void configure(Long seed, int partition, int numMapTasks, int numTrees) {
     converter = new DataConverter(getDataset());
-
+    
+    // prepare random-numbers generator
-    log.debug("seed : {}", seed);
-    if (seed == null)
+    Step1Mapper.log.debug("seed : {}", seed);
+    if (seed == null) {
       rng = RandomUtils.getRandom();
-    else
+    } else {
       rng = RandomUtils.getRandom(seed);
-
+    }
+    
     // mapper's partition
     if (partition < 0) {
       throw new IllegalArgumentException("Wrong partition ID");
     }
     this.partition = partition;
-
+    
     // compute number of trees to build
-    nbTrees = nbTrees(numMapTasks, numTrees, partition);
-
+    nbTrees = Step1Mapper.nbTrees(numMapTasks, numTrees, partition);
+    
     // compute first tree id
     firstTreeId = 0;
     for (int p = 0; p < partition; p++) {
-      firstTreeId += nbTrees(numMapTasks, numTrees, p);
+      firstTreeId += Step1Mapper.nbTrees(numMapTasks, numTrees, p);
     }
-
-    log.debug("partition : {}", partition);
-    log.debug("nbTrees : {}", nbTrees);
-    log.debug("firstTreeId : {}", firstTreeId);
+    
+    Step1Mapper.log.debug("partition : {}", partition);
+    Step1Mapper.log.debug("nbTrees : {}", nbTrees);
+    Step1Mapper.log.debug("firstTreeId : {}", firstTreeId);
   }
-
+  
   /**
-   * Compute the number of trees for a given partition. The first partition (0)
-   * may be longer than the rest of partition because of the remainder.
+   * Compute the number of trees for a given partition. The first partition (0) may be longer than the rest of
+   * the partitions because of the remainder.
    * 
-   * @param numMaps total number of maps (partitions)
-   * @param numTrees total number of trees to build
-   * @param partition partition to compute the number of trees for
+   * @param numMaps
+   *          total number of maps (partitions)
+   * @param numTrees
+   *          total number of trees to build
+   * @param partition
+   *          partition to compute the number of trees for
    * @return
    */
   public static int nbTrees(int numMaps, int numTrees, int partition) {
@@ -132,46 +135,44 @@
     if (partition == 0) {
       nbTrees += numTrees - nbTrees * numMaps;
     }
-
+    
     return nbTrees;
   }
-
   
   @Override
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
     instances.add(converter.convert((int) key.get(), value.toString()));
   }
-
   
   @Override
   protected void cleanup(Context context) throws IOException, InterruptedException {
     // prepare the data
-    log.debug("partition: {} numInstances: {}", partition, instances.size());
-
+    Step1Mapper.log.debug("partition: {} numInstances: {}", partition, instances.size());
+    
     Data data = new Data(getDataset(), instances);
     Bagging bagging = new Bagging(getTreeBuilder(), data);
-
+    
     TreeID key = new TreeID();
-
-    log.debug("Building {} trees", nbTrees);
+    
+    Step1Mapper.log.debug("Building {} trees", nbTrees);
     SingleTreePredictions callback = null;
     int[] predictions = null;
     for (int treeId = 0; treeId < nbTrees; treeId++) {
-      log.debug("Building tree number : {}", treeId);
+      Step1Mapper.log.debug("Building tree number : {}", treeId);
       if (isOobEstimate() && !isNoOutput()) {
         callback = new SingleTreePredictions(data.size());
         predictions = callback.getPredictions();
       }
-
+      
       Node tree = bagging.build(treeId, rng, callback);
-
+      
       key.set(partition, firstTreeId + treeId);
-
+      
       if (!isNoOutput()) {
         MapredOutput emOut = new MapredOutput(tree, predictions);
         context.write(key, emOut);
       }
     }
   }
-
+  
 }

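For context, a minimal sketch (not part of this commit) of the tree distribution computed by nbTrees() above: every partition gets numTrees / numMaps trees and partition 0 absorbs the remainder, so the forest size works out exactly. The class name and numbers below are made up.

import org.apache.mahout.df.mapreduce.partial.Step1Mapper;

public class NbTreesSketch {
  public static void main(String[] args) {
    int numMaps = 3;
    int numTrees = 10;
    for (int partition = 0; partition < numMaps; partition++) {
      System.out.println("partition " + partition + " builds "
          + Step1Mapper.nbTrees(numMaps, numTrees, partition) + " trees");
    }
    // partition 0 builds 4 trees, partitions 1 and 2 build 3 trees each
  }
}
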
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Job.java Sat Feb 13 20:27:25 2010
@@ -35,31 +35,33 @@
 import org.apache.mahout.df.node.Node;
 
 /**
- * 2nd step of the partial mapreduce builder. Computes the oob predictions using
- * all the trees of the forest
+ * 2nd step of the partial mapreduce builder. Computes the oob predictions using all the trees of the forest
  */
 public class Step2Job {
   
   /** directory that will hold this job's output */
   private final Path outputPath;
-
+  
   /** file that will contain the forest, passed to the maps */
   private final Path forestPath;
-
+  
   /** file that contains the serialized dataset */
   private final Path datasetPath;
-
+  
   /** directory that contains the data used in the first step */
   private final Path dataPath;
-
+  
   /** partitions info in Hadoop's order */
   private final Step0Output[] partitions;
   
   /**
-   * @param base base directory
-   * @param dataPath data used in the first step
+   * @param base
+   *          base directory
+   * @param dataPath
+   *          data used in the first step
    * @param datasetPath
-   * @param partitions partitions' infos in hadoop's order
+   * @param partitions
+   *          partitions' infos in hadoop's order
    */
   public Step2Job(Path base, Path dataPath, Path datasetPath, Step0Output[] partitions) {
     this.outputPath = new Path(base, "step2.output");
@@ -68,52 +70,57 @@
     this.datasetPath = datasetPath;
     this.partitions = partitions;
   }
-
+  
   /**
    * Run the second step.
    * 
-   * @param conf configuration
-   * @param keys keys returned by the first step
-   * @param trees trees returned by the first step
+   * @param conf
+   *          configuration
+   * @param keys
+   *          keys returned by the first step
+   * @param trees
+   *          trees returned by the first step
    * @param callback
    */
-  public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback)
-      throws IOException, ClassNotFoundException, InterruptedException {
+  public void run(Configuration conf, TreeID[] keys, Node[] trees, PredictionCallback callback) throws IOException,
+                                                                                               ClassNotFoundException,
+                                                                                               InterruptedException {
     if (callback == null) {
       // no need to launch the job
       return;
     }
-
+    
     int numTrees = keys.length;
     
     // check the output
-    if (outputPath.getFileSystem(conf).exists(outputPath))
+    if (outputPath.getFileSystem(conf).exists(outputPath)) {
       throw new IOException("Output path already exists : " + outputPath);
-
+    }
+    
     int[] sizes = Step0Output.extractSizes(partitions);
     
     InterResults.store(forestPath.getFileSystem(conf), forestPath, keys, trees, sizes);
-
+    
     // needed by the mapper
     Builder.setNbTrees(conf, numTrees);
-
+    
     // put the dataset and the forest into the DistributedCache
     // use setCacheFiles() to overwrite the first-step cache files
-    URI[] files = { datasetPath.toUri(), forestPath.toUri() };
+    URI[] files = {datasetPath.toUri(), forestPath.toUri()};
     DistributedCache.setCacheFiles(files, conf);
-
+    
     Job job = new Job(conf);
     job.setJarByClass(Step2Job.class);
-
+    
     FileInputFormat.setInputPaths(job, dataPath);
     FileOutputFormat.setOutputPath(job, outputPath);
-
+    
     job.setOutputKeyClass(TreeID.class);
     job.setOutputValueClass(MapredOutput.class);
-
+    
     job.setMapperClass(Step2Mapper.class);
     job.setNumReduceTasks(0); // no reducers
-
+    
     job.setInputFormatClass(TextInputFormat.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     
@@ -122,7 +129,7 @@
       parseOutput(job, callback);
     }
   }
-
+  
   /**
    * Extracts the output and processes it
    * 
@@ -130,19 +137,18 @@
    * @param callback
    * @throws IOException
    */
-  protected void parseOutput(Job job, PredictionCallback callback)
-      throws IOException {
+  protected void parseOutput(Job job, PredictionCallback callback) throws IOException {
     Configuration conf = job.getConfiguration();
     
     int numMaps = Builder.getNumMaps(conf);
     int numTrees = Builder.getNbTrees(conf);
-
+    
     // compute the total number of output values
     int total = 0;
     for (int partition = 0; partition < numMaps; partition++) {
       total += Step2Mapper.nbConcerned(numMaps, numTrees, partition);
     }
-
+    
     int[] firstIds = Step0Output.extractFirstIds(partitions);
     PartialBuilder.processOutput(job, outputPath, firstIds, null, null, callback);
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/Step2Mapper.java Sat Feb 13 20:27:25 2010
@@ -39,27 +39,25 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Second step of PartialBuilder. Using the trees of the first step, computes
- * the oob predictions for each tree, except those of its own partition, on all
- * instancesof the partition.
+ * Second step of PartialBuilder. Using the trees of the first step, computes the oob predictions for each
+ * tree, except those of its own partition, on all instances of the partition.
  */
-public class Step2Mapper extends Mapper<LongWritable, Text, TreeID, MapredOutput> {
-
+public class Step2Mapper extends Mapper<LongWritable,Text,TreeID,MapredOutput> {
+  
   private static final Logger log = LoggerFactory.getLogger(Step2Mapper.class);
-
+  
   private TreeID[] keys;
-
+  
   private Node[] trees;
-
+  
   private SingleTreePredictions[] callbacks;
-
+  
   private DataConverter converter;
-
+  
   private int partition = -1;
-
+  
   /** num treated instances */
   private int instanceId;
-
   
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
@@ -67,45 +65,46 @@
     
     // get the cached files' paths
     URI[] files = DistributedCache.getCacheFiles(conf);
-
-    log.info("DistributedCache.getCacheFiles(): {}", ArrayUtils.toString(files));
     
-    if (files == null || files.length < 2) {
+    Step2Mapper.log.info("DistributedCache.getCacheFiles(): {}", ArrayUtils.toString(files));
+    
+    if ((files == null) || (files.length < 2)) {
       throw new IllegalArgumentException("missing paths from the DistributedCache");
     }
-
+    
     Path datasetPath = new Path(files[0].getPath());
     Dataset dataset = Dataset.load(conf, datasetPath);
-
+    
     int numMaps = Builder.getNumMaps(conf);
     int p = conf.getInt("mapred.task.partition", -1);
-
+    
     // total number of trees in the forest
     int numTrees = Builder.getNbTrees(conf);
     if (numTrees == -1) {
       throw new IllegalArgumentException("numTrees not found !");
     }
-
-    int nbConcerned = nbConcerned(numMaps, numTrees, p);
+    
+    int nbConcerned = Step2Mapper.nbConcerned(numMaps, numTrees, p);
     keys = new TreeID[nbConcerned];
     trees = new Node[nbConcerned];
-
+    
     Path forestPath = new Path(files[1].getPath());
     FileSystem fs = forestPath.getFileSystem(conf);
-    int numInstances = InterResults.load(fs, forestPath, numMaps, numTrees,
-        p, keys, trees);
-
-    log.debug("partition: {} numInstances: {}", p, numInstances);
+    int numInstances = InterResults.load(fs, forestPath, numMaps, numTrees, p, keys, trees);
+    
+    Step2Mapper.log.debug("partition: {} numInstances: {}", p, numInstances);
     configure(p, dataset, keys, trees, numInstances);
   }
-
+  
   /**
-   * Compute the number of trees that need to classify the instances of this
-   * mapper's partition
+   * Compute the number of trees that need to classify the instances of this mapper's partition
    * 
-   * @param numMaps total number of map tasks
-   * @param numTrees total number of trees in the forest
-   * @param partition mapper's partition
+   * @param numMaps
+   *          total number of map tasks
+   * @param numTrees
+   *          total number of trees in the forest
+   * @param partition
+   *          mapper's partition
    * @return
    */
   public static int nbConcerned(int numMaps, int numTrees, int partition) {
@@ -115,63 +114,64 @@
     // the trees of the mapper's partition are not concerned
     return numTrees - Step1Mapper.nbTrees(numMaps, numTrees, partition);
   }
-
+  
   /**
    * Useful for testing. Configures the mapper without using a JobConf<br>
    * TODO we don't need the keys partitions, the tree ids should suffice
    * 
-   * @param partition mapper's partition
+   * @param partition
+   *          mapper's partition
    * @param dataset
-   * @param keys keys returned by the first step
-   * @param trees trees returned by the first step
-   * @param numInstances number of instances in the mapper's partition
+   * @param keys
+   *          keys returned by the first step
+   * @param trees
+   *          trees returned by the first step
+   * @param numInstances
+   *          number of instances in the mapper's partition
    */
-  public void configure(int partition, Dataset dataset, TreeID[] keys,
-      Node[] trees, int numInstances) {
+  public void configure(int partition, Dataset dataset, TreeID[] keys, Node[] trees, int numInstances) {
     this.partition = partition;
     if (partition < 0) {
       throw new IllegalArgumentException("Wrong partition id : " + partition);
     }
-
+    
     converter = new DataConverter(dataset);
-
+    
     if (keys.length != trees.length) {
       throw new IllegalArgumentException("keys.length != trees.length");
     }
     int nbConcerned = keys.length;
-
+    
     this.keys = keys;
     this.trees = trees;
-
+    
     // make sure the trees are not from this partition
     for (TreeID key : keys) {
       if (key.partition() == partition) {
         throw new IllegalArgumentException("a tree from this partition was found !");
       }
     }
-
+    
     // init the callbacks
     callbacks = new SingleTreePredictions[nbConcerned];
     for (int index = 0; index < nbConcerned; index++) {
       callbacks[index] = new SingleTreePredictions(numInstances);
     }
-
+    
   }
-
   
   @Override
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
-
+    
     Instance instance = converter.convert(instanceId, value.toString());
-
+    
     for (int index = 0; index < keys.length; index++) {
       int prediction = trees[index].classify(instance);
       callbacks[index].prediction(index, instanceId, prediction);
     }
-
+    
     instanceId++;
   }
-
   
   @Override
   protected void cleanup(Context context) throws IOException, InterruptedException {
@@ -180,5 +180,5 @@
       context.write(key, new MapredOutput(callbacks[index].getPredictions()));
     }
   }
-
+  
 }

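Continuing the Step1Mapper example above, a minimal sketch (not part of this commit) of nbConcerned(): a second-step mapper loads every tree except the ones grown on its own partition, so with 3 map tasks and 10 trees partition 0 applies 6 foreign trees and partitions 1 and 2 apply 7 each. The class name and numbers below are made up.

import org.apache.mahout.df.mapreduce.partial.Step2Mapper;

public class NbConcernedSketch {
  public static void main(String[] args) {
    System.out.println(Step2Mapper.nbConcerned(3, 10, 0)); // 10 - 4 = 6
    System.out.println(Step2Mapper.nbConcerned(3, 10, 1)); // 10 - 3 = 7
    System.out.println(Step2Mapper.nbConcerned(3, 10, 2)); // 10 - 3 = 7
  }
}
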
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/partial/TreeID.java Sat Feb 13 20:27:25 2010
@@ -23,11 +23,11 @@
  * Indicates both the tree and the data partition used to grow the tree
  */
 public class TreeID extends LongWritable implements Cloneable {
-
+  
   public static final int MAX_TREEID = 100000;
-  public TreeID() {
-  }
-
+  
+  public TreeID() {}
+  
   public TreeID(int partition, int treeId) {
     if (partition < 0) {
       throw new IllegalArgumentException("partition < 0");
@@ -37,21 +37,22 @@
     }
     set(partition, treeId);
   }
-
+  
   public void set(int partition, int treeId) {
-    super.set((long) partition * MAX_TREEID + treeId);
+    super.set((long) partition * TreeID.MAX_TREEID + treeId);
   }
   
   /**
    * Data partition (InputSplit's index) that was used to grow the tree
+   * 
    * @return
    */
   public int partition() {
-    return (int) (get() / MAX_TREEID);
+    return (int) (get() / TreeID.MAX_TREEID);
   }
   
   public int treeId() {
-    return (int) (get() % MAX_TREEID);
+    return (int) (get() % TreeID.MAX_TREEID);
   }
   
   @Override

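For context, a minimal sketch (not part of this commit) of the TreeID encoding shown above: the partition id and tree id are packed into the inherited long as partition * MAX_TREEID + treeId, and partition()/treeId() recover them by division and modulo. The class name and values below are made up.

import org.apache.mahout.df.mapreduce.partial.TreeID;

public class TreeIDSketch {
  public static void main(String[] args) {
    TreeID id = new TreeID(3, 42);      // partition 3, tree 42

    System.out.println(id.get());       // 300042 = 3 * 100000 + 42
    System.out.println(id.partition()); // 3
    System.out.println(id.treeId());    // 42
  }
}
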
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/CategoricalNode.java Sat Feb 13 20:27:25 2010
@@ -28,20 +28,19 @@
 
 public class CategoricalNode extends Node {
   private int attr;
-
+  
   private double[] values;
-
+  
   private Node[] childs;
-
-  public CategoricalNode() {
-  }
+  
+  public CategoricalNode() {}
   
   public CategoricalNode(int attr, double[] values, Node[] childs) {
     this.attr = attr;
     this.values = values;
     this.childs = childs;
   }
-
+  
   @Override
   public int classify(Instance instance) {
     int index = ArrayUtils.indexOf(values, instance.get(attr));
@@ -51,49 +50,51 @@
     }
     return childs[index].classify(instance);
   }
-
+  
   @Override
   public long maxDepth() {
     long max = 0;
-
+    
     for (Node child : childs) {
       long depth = child.maxDepth();
-      if (depth > max)
+      if (depth > max) {
         max = depth;
+      }
     }
-
+    
     return 1 + max;
   }
-
+  
   @Override
   public long nbNodes() {
     long nbNodes = 1;
-
+    
     for (Node child : childs) {
       nbNodes += child.nbNodes();
     }
-
+    
     return nbNodes;
   }
-
+  
   @Override
   protected Type getType() {
     return Type.CATEGORICAL;
   }
-
+  
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null || !(obj instanceof CategoricalNode))
+    }
+    if ((obj == null) || !(obj instanceof CategoricalNode)) {
       return false;
-
+    }
+    
     CategoricalNode node = (CategoricalNode) obj;
-
-    return attr == node.attr && Arrays.equals(values, node.values)
-        && Arrays.equals(childs, node.childs);
+    
+    return (attr == node.attr) && Arrays.equals(values, node.values) && Arrays.equals(childs, node.childs);
   }
-
+  
   @Override
   public int hashCode() {
     int hashCode = attr;
@@ -110,20 +111,20 @@
   protected String getString() {
     StringBuilder buffer = new StringBuilder();
     
-    for (Node child:childs) {
+    for (Node child : childs) {
       buffer.append(child).append(',');
     }
     
     return buffer.toString();
   }
-
+  
   @Override
   public void readFields(DataInput in) throws IOException {
     attr = in.readInt();
     values = DFUtils.readDoubleArray(in);
     childs = DFUtils.readNodeArray(in);
   }
-
+  
   @Override
   protected void writeNode(DataOutput out) throws IOException {
     out.writeInt(attr);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Leaf.java Sat Feb 13 20:27:25 2010
@@ -29,29 +29,28 @@
  */
 public class Leaf extends Node {
   private int label;
-
-  protected Leaf() {
-  }
-
+  
+  protected Leaf() {}
+  
   public Leaf(int label) {
     this.label = label;
   }
-
+  
   @Override
   public int classify(Instance instance) {
     return label;
   }
-
+  
   @Override
   public long maxDepth() {
     return 1;
   }
-
+  
   @Override
   public long nbNodes() {
     return 1;
   }
-
+  
   /**
    * Extract a Leaf Node
    * 
@@ -60,42 +59,44 @@
    */
   static Leaf parse(StringTokenizer tokenizer) {
     int label = Integer.parseInt(tokenizer.nextToken());
-
+    
     return new Leaf(label);
   }
-
+  
   @Override
   protected Type getType() {
     return Type.LEAF;
   }
-
+  
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null || !(obj instanceof Leaf))
+    }
+    if ((obj == null) || !(obj instanceof Leaf)) {
       return false;
-
+    }
+    
     Leaf leaf = (Leaf) obj;
-
+    
     return label == leaf.label;
   }
-
+  
   @Override
   public int hashCode() {
     return label;
   }
- 
+  
   @Override
   protected String getString() {
     return "";
   }
-
+  
   @Override
   public void readFields(DataInput in) throws IOException {
     label = in.readInt();
   }
-
+  
   @Override
   protected void writeNode(DataOutput out) throws IOException {
     out.writeInt(label);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/MockLeaf.java Sat Feb 13 20:27:25 2010
@@ -19,22 +19,21 @@
 import org.apache.mahout.df.data.Instance;
 
 /**
- * Custom Leaf node that returns for each instance its own label. Used mainly
- * for testing purposes
+ * Custom Leaf node that returns for each instance its own label. Used mainly for testing purposes
  * 
  */
 public class MockLeaf extends Leaf {
-
+  
   @Override
   public int classify(Instance instance) {
     return instance.label;
   }
-
+  
   @Override
   protected Type getType() {
     return Type.MOCKLEAF;
   }
-
+  
   @Override
   protected String getString() {
     return "[MockLeaf]";

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/Node.java Sat Feb 13 20:27:25 2010
@@ -28,11 +28,14 @@
  * Represents an abstract node of a decision tree
  */
 public abstract class Node implements Writable {
-
+  
   protected enum Type {
-    MOCKLEAF, LEAF, NUMERICAL, CATEGORICAL
+    MOCKLEAF,
+    LEAF,
+    NUMERICAL,
+    CATEGORICAL
   }
-
+  
   /**
    * predicts the label for the instance
    * 
@@ -40,27 +43,27 @@
    * @return -1 if the label cannot be predicted
    */
   public abstract int classify(Instance instance);
-
+  
   /**
    * returns the total number of nodes of the tree
    * 
    * @return
    */
   public abstract long nbNodes();
-
+  
   /**
    * returns the maximum depth of the tree
    * 
    * @return
    */
   public abstract long maxDepth();
-
+  
   protected abstract Type getType();
-
+  
   public static Node read(DataInput in) throws IOException {
     Type type = Type.values()[in.readInt()];
     Node node;
-
+    
     switch (type) {
       case MOCKLEAF:
         node = new MockLeaf();
@@ -75,28 +78,27 @@
         node = new CategoricalNode();
         break;
       default:
-        throw new IllegalStateException(
-            "This implementation is not currently supported");
+        throw new IllegalStateException("This implementation is not currently supported");
     }
-
+    
     node.readFields(in);
-
+    
     return node;
   }
-
+  
   @Override
   public final String toString() {
     return getType() + ":" + getString() + ';';
   }
-
+  
   protected abstract String getString();
-
+  
   @Override
   public final void write(DataOutput out) throws IOException {
     out.writeInt(getType().ordinal());
     writeNode(out);
   }
-
+  
   protected abstract void writeNode(DataOutput out) throws IOException;
-
+  
 }

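For context, a minimal round-trip sketch (not part of this commit) of the serialization shown above: write() prefixes the node data with its Type ordinal and read() uses that ordinal to instantiate the right subclass before calling readFields(). The class name and tree below are made up.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.mahout.df.node.Leaf;
import org.apache.mahout.df.node.Node;
import org.apache.mahout.df.node.NumericalNode;

public class NodeRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // a one-split tree: attribute 0 < 1.5 -> label 0, otherwise label 1
    Node tree = new NumericalNode(0, 1.5, new Leaf(0), new Leaf(1));

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    tree.write(new DataOutputStream(bytes)); // type ordinal + node fields, recursively

    Node copy = Node.read(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    System.out.println(tree.equals(copy));   // true
  }
}
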
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/node/NumericalNode.java Sat Feb 13 20:27:25 2010
@@ -29,62 +29,64 @@
 public class NumericalNode extends Node {
   /** numerical attribute to split for */
   private int attr;
-
+  
   /** split value */
   private double split;
-
+  
   /** child node when attribute's value < split value */
   private Node loChild;
-
+  
   /** child node when attribute's value >= split value */
   private Node hiChild;
-
-  public NumericalNode() {
-  }
-
+  
+  public NumericalNode() {}
+  
   public NumericalNode(int attr, double split, Node loChild, Node hiChild) {
     this.attr = attr;
     this.split = split;
     this.loChild = loChild;
     this.hiChild = hiChild;
   }
-
+  
   @Override
   public int classify(Instance instance) {
-    if (instance.get(attr) < split)
+    if (instance.get(attr) < split) {
       return loChild.classify(instance);
-    else
+    } else {
       return hiChild.classify(instance);
+    }
   }
-
+  
   @Override
   public long maxDepth() {
     return 1 + Math.max(loChild.maxDepth(), hiChild.maxDepth());
   }
-
+  
   @Override
   public long nbNodes() {
     return 1 + loChild.nbNodes() + hiChild.nbNodes();
   }
-
+  
   @Override
   protected Type getType() {
     return Type.NUMERICAL;
-  }  
-
+  }
+  
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null || !(obj instanceof NumericalNode))
+    }
+    if ((obj == null) || !(obj instanceof NumericalNode)) {
       return false;
-
+    }
+    
     NumericalNode node = (NumericalNode) obj;
-
-    return attr == node.attr && split == node.split
-        && loChild.equals(node.loChild) && hiChild.equals(node.hiChild);
+    
+    return (attr == node.attr) && (split == node.split) && loChild.equals(node.loChild)
+           && hiChild.equals(node.hiChild);
   }
-
+  
   @Override
   public int hashCode() {
     return attr + (int) Double.doubleToLongBits(split) + loChild.hashCode() + hiChild.hashCode();
@@ -94,7 +96,7 @@
   protected String getString() {
     return loChild.toString() + ',' + hiChild.toString();
   }
-
+  
   @Override
   public void readFields(DataInput in) throws IOException {
     attr = in.readInt();
@@ -102,7 +104,7 @@
     loChild = Node.read(in);
     hiChild = Node.read(in);
   }
-
+  
   @Override
   protected void writeNode(DataOutput out) throws IOException {
     out.writeInt(attr);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/ref/SequentialBuilder.java Sat Feb 13 20:27:25 2010
@@ -34,41 +34,45 @@
  * Builds a Random Decision Forest using a given TreeBuilder to grow the trees
  */
 public class SequentialBuilder {
-
+  
   private static final Logger log = LoggerFactory.getLogger(SequentialBuilder.class);
-
+  
   private final Random rng;
   
   private final Bagging bagging;
-
+  
   /**
    * Constructor
    * 
-   * @param rng random-numbers generator
-   * @param treeBuilder tree builder
-   * @param data training data
+   * @param rng
+   *          random-numbers generator
+   * @param treeBuilder
+   *          tree builder
+   * @param data
+   *          training data
    */
   public SequentialBuilder(Random rng, TreeBuilder treeBuilder, Data data) {
     this.rng = rng;
     bagging = new Bagging(treeBuilder, data);
   }
-
+  
   public DecisionForest build(int nbTrees, PredictionCallback callback) {
     List<Node> trees = new ArrayList<Node>();
     
     for (int treeId = 0; treeId < nbTrees; treeId++) {
       trees.add(bagging.build(treeId, rng, callback));
-      logProgress(((float) treeId + 1) / nbTrees);
+      SequentialBuilder.logProgress(((float) treeId + 1) / nbTrees);
     }
-
+    
     return new DecisionForest(trees);
   }
   
   private static void logProgress(float progress) {
     int percent = (int) (progress * 100);
-    if (percent % 10 == 0)
-      log.info(String.format("Building %2d%%", percent));
-
+    if (percent % 10 == 0) {
+      SequentialBuilder.log.info(String.format("Building %2d%%", percent));
+    }
+    
   }
-
+  
 }

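SequentialBuilder is the reference, single-machine builder: each call to Bagging.build() draws a bootstrap sample of the training data and grows one tree with the supplied TreeBuilder. A minimal usage sketch, assuming the training Data has already been loaded and that DefaultTreeBuilder from org.apache.mahout.df.builder is the TreeBuilder implementation in use (both assumptions):

    // Grow a 10-tree forest sequentially; passing null for the PredictionCallback
    // is assumed to be acceptable when no out-of-bag estimation is needed.
    Random rng = new Random(1L);                         // java.util.Random, arbitrary seed
    TreeBuilder treeBuilder = new DefaultTreeBuilder();  // assumed implementation
    SequentialBuilder builder = new SequentialBuilder(rng, treeBuilder, data);  // data loaded elsewhere
    DecisionForest forest = builder.build(10, null);
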
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/DefaultIgSplit.java Sat Feb 13 20:27:25 2010
@@ -26,17 +26,17 @@
  * Default, not optimized, implementation of IgSplit
  */
 public class DefaultIgSplit extends IgSplit {
-
+  
   /** used by entropy() */
   private int[] counts;
-
+  
   @Override
   public Split computeSplit(Data data, int attr) {
     if (data.getDataset().isNumerical(attr)) {
       double[] values = data.values(attr);
       double bestIg = -1;
       double bestSplit = 0.0;
-
+      
       for (double value : values) {
         double ig = numericalIg(data, attr, value);
         if (ig > bestIg) {
@@ -44,15 +44,15 @@
           bestSplit = value;
         }
       }
-
+      
       return new Split(attr, bestIg, bestSplit);
     } else {
       double ig = categoricalIg(data, attr);
-
+      
       return new Split(attr, ig);
     }
   }
-
+  
   /**
    * Computes the Information Gain for a CATEGORICAL attribute
    * 
@@ -65,18 +65,17 @@
     double hy = entropy(data); // H(Y)
     double hyx = 0.0; // H(Y|X)
     double invDataSize = 1.0 / data.size();
-
+    
     for (double value : values) {
       Data subset = data.subset(Condition.equals(attr, value));
       hyx += subset.size() * invDataSize * entropy(subset);
     }
-
+    
     return hy - hyx;
   }
-
+  
   /**
-   * Computes the Information Gain for a NUMERICAL attribute given a splitting
-   * value
+   * Computes the Information Gain for a NUMERICAL attribute given a splitting value
    * 
    * @param data
    * @param attr
@@ -86,18 +85,18 @@
   protected double numericalIg(Data data, int attr, double split) {
     double hy = entropy(data);
     double invDataSize = 1.0 / data.size();
-
+    
     // LO subset
     Data subset = data.subset(Condition.lesser(attr, split));
     hy -= subset.size() * invDataSize * entropy(subset);
-
+    
     // HI subset
     subset = data.subset(Condition.greaterOrEquals(attr, split));
     hy -= subset.size() * invDataSize * entropy(subset);
-
+    
     return hy;
   }
-
+  
   /**
    * Computes the Entropy
    * 
@@ -106,23 +105,25 @@
    */
   protected double entropy(Data data) {
     double invDataSize = 1.0 / data.size();
-
-    if (counts == null)
+    
+    if (counts == null) {
       counts = new int[data.getDataset().nblabels()];
-
+    }
+    
     Arrays.fill(counts, 0);
     data.countLabels(counts);
-
+    
     double entropy = 0.0;
     for (int label = 0; label < data.getDataset().nblabels(); label++) {
       int count = counts[label];
-      if (count == 0)
+      if (count == 0) {
         continue; // otherwise we get a NaN
+      }
       double p = count * invDataSize;
-      entropy += -p * Math.log(p) / LOG2;
+      entropy += -p * Math.log(p) / IgSplit.LOG2;
     }
-
+    
     return entropy;
   }
-
+  
 }

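The entropy computed here is the usual H(Y) = -sum_i p_i * log2(p_i) over the label distribution; dividing Math.log(p) by LOG2 converts the natural logarithm to base 2. A small worked example of that loop (the label counts and the resulting value are illustrative only):

    // 14 instances with label counts {9, 5}:
    // H = -(9/14) * log2(9/14) - (5/14) * log2(5/14) ≈ 0.940 bits
    double log2 = Math.log(2.0);
    int[] counts = {9, 5};
    double invDataSize = 1.0 / 14;
    double entropy = 0.0;
    for (int count : counts) {
      if (count == 0) {
        continue;                        // a zero count would otherwise yield NaN
      }
      double p = count * invDataSize;
      entropy += -p * Math.log(p) / log2;
    }
    // entropy ≈ 0.940
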
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/IgSplit.java Sat Feb 13 20:27:25 2010
@@ -23,9 +23,9 @@
  * Computes the best split using the Information Gain measure
  */
 public abstract class IgSplit {
-
+  
   protected static final double LOG2 = Math.log(2.0);
-
+  
   /**
    * Computes the best split for the given attribute
    * 
@@ -34,5 +34,5 @@
    * @return
    */
   public abstract Split computeSplit(Data data, int attr);
-
+  
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/OptIgSplit.java Sat Feb 13 20:27:25 2010
@@ -28,22 +28,22 @@
  * Optimized implementation of IgSplit
  */
 public class OptIgSplit extends IgSplit {
-
+  
   private int[][] counts;
-
+  
   private int[] countAll;
-
+  
   private int[] countLess;
-
+  
   @Override
   public Split computeSplit(Data data, int attr) {
     if (data.getDataset().isNumerical(attr)) {
       return numericalSplit(data, attr);
     } else {
-      return categoricalSplit(data, attr);
+      return OptIgSplit.categoricalSplit(data, attr);
     }
   }
-
+  
   /**
    * Computes the split for a CATEGORICAL attribute
    * 
@@ -55,28 +55,28 @@
     double[] values = data.values(attr);
     int[][] counts = new int[values.length][data.getDataset().nblabels()];
     int[] countAll = new int[data.getDataset().nblabels()];
-
+    
     // compute frequencies
     for (int index = 0; index < data.size(); index++) {
       Instance instance = data.get(index);
       counts[ArrayUtils.indexOf(values, instance.get(attr))][instance.label]++;
       countAll[instance.label]++;
     }
-
+    
     int size = data.size();
-    double hy = entropy(countAll, size); // H(Y)
+    double hy = OptIgSplit.entropy(countAll, size); // H(Y)
     double hyx = 0.0; // H(Y|X)
     double invDataSize = 1.0 / size;
-
+    
     for (int index = 0; index < values.length; index++) {
       size = DataUtils.sum(counts[index]);
-      hyx += size * invDataSize * entropy(counts[index], size);
+      hyx += size * invDataSize * OptIgSplit.entropy(counts[index], size);
     }
-
+    
     double ig = hy - hyx;
     return new Split(attr, ig);
   }
-
+  
   /**
    * Return the sorted list of distinct values for the given attribute
    * 
@@ -87,10 +87,10 @@
   private static double[] sortedValues(Data data, int attr) {
     double[] values = data.values(attr);
     Arrays.sort(values);
-
+    
     return values;
   }
-
+  
   /**
    * Instantiates the counting arrays
    * 
@@ -102,7 +102,7 @@
     countAll = new int[data.getDataset().nblabels()];
     countLess = new int[data.getDataset().nblabels()];
   }
-
+  
   protected void computeFrequencies(Data data, int attr, double[] values) {
     for (int index = 0; index < data.size(); index++) {
       Instance instance = data.get(index);
@@ -119,68 +119,72 @@
    * @return
    */
   protected Split numericalSplit(Data data, int attr) {
-    double[] values = sortedValues(data, attr);
-
+    double[] values = OptIgSplit.sortedValues(data, attr);
+    
     initCounts(data, values);
-
+    
     computeFrequencies(data, attr, values);
-
+    
     int size = data.size();
-    double hy = entropy(countAll, size);
+    double hy = OptIgSplit.entropy(countAll, size);
     double invDataSize = 1.0 / size;
-
+    
     int best = -1;
     double bestIg = -1.0;
-
+    
     // try each possible split value
     for (int index = 0; index < values.length; index++) {
       double ig = hy;
-
+      
       // instance with attribute value < values[index]
       size = DataUtils.sum(countLess);
-      ig -= size * invDataSize * entropy(countLess, size);
-
+      ig -= size * invDataSize * OptIgSplit.entropy(countLess, size);
+      
       // instance with attribute value >= values[index]
       size = DataUtils.sum(countAll);
-      ig -= size * invDataSize * entropy(countAll, size);
-
+      ig -= size * invDataSize * OptIgSplit.entropy(countAll, size);
+      
       if (ig > bestIg) {
         bestIg = ig;
         best = index;
       }
-
+      
       DataUtils.add(countLess, counts[index]);
       DataUtils.dec(countAll, counts[index]);
     }
-
+    
     if (best == -1) {
       throw new IllegalStateException("no best split found !");
     }
     return new Split(attr, bestIg, values[best]);
   }
-
+  
   /**
    * Computes the Entropy
    * 
-   * @param counts counts[i] = numInstances with label i
-   * @param dataSize numInstances
+   * @param counts
+   *          counts[i] = numInstances with label i
+   * @param dataSize
+   *          numInstances
    * @return
    */
   private static double entropy(int[] counts, int dataSize) {
-    if (dataSize == 0)
+    if (dataSize == 0) {
       return 0.0;
-
+    }
+    
     double entropy = 0.0;
     double invDataSize = 1.0 / dataSize;
-
+    
     for (int count : counts) {
-      if (count == 0)
+      if (count == 0) {
         continue; // otherwise we get a NaN
+      }
       double p = count * invDataSize;
-      entropy += -p * Math.log(p) / LOG2;
+      entropy += -p * Math.log(p) / IgSplit.LOG2;
     }
-
+    
     return entropy;
   }
-
+  
 }

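Both split implementations score a candidate by its information gain, IG = H(Y) - |lo|/|D| * H(lo) - |hi|/|D| * H(hi); OptIgSplit avoids re-scanning the data for every candidate by keeping the per-label counts in countLess and countAll and moving counts[index] from one array to the other after each value. A worked example for a single candidate (the counts are illustrative, and entropy(int[], int) stands in for the private helper above, which is not visible outside the class):

    // 14 instances with label counts {9, 5}; the candidate threshold leaves
    // {6, 2} strictly below it (countLess) and {3, 3} at or above it (countAll).
    double hy  = entropy(new int[] {9, 5}, 14);   // ≈ 0.940
    double hLo = entropy(new int[] {6, 2}, 8);    // ≈ 0.811
    double hHi = entropy(new int[] {3, 3}, 6);    // = 1.0
    double ig  = hy - (8.0 / 14) * hLo - (6.0 / 14) * hHi;   // ≈ 0.048
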
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/split/Split.java Sat Feb 13 20:27:25 2010
@@ -17,27 +17,26 @@
 
 package org.apache.mahout.df.split;
 
-
 /**
  * Contains enough information to identify each split
  */
 public class Split {
-
+  
   /** attribute to split for */
   public final int attr;
-
+  
   /** Information Gain of the split */
   public final double ig;
-
+  
   /** split value for NUMERICAL attributes */
   public final double split;
-
+  
   public Split(int attr, double ig, double split) {
     this.attr = attr;
     this.ig = ig;
     this.split = split;
   }
-
+  
   public Split(int attr, double ig) {
     this(attr, ig, Double.NaN);
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Describe.java Sat Feb 13 20:27:25 2010
@@ -34,11 +34,11 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.df.data.DataLoader;
 import org.apache.mahout.df.data.Dataset;
 import org.apache.mahout.df.data.DescriptorException;
 import org.apache.mahout.df.data.DescriptorUtils;
-import org.apache.mahout.common.CommandLineUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -46,86 +46,81 @@
  * Generates a file descriptor for a given dataset
  */
 public class Describe {
-
+  
   private static final Logger log = LoggerFactory.getLogger(Describe.class);
-
-  private Describe() {
-  }
-
+  
+  private Describe() {}
+  
   public static void main(String[] args) throws IOException, DescriptorException {
-
+    
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option pathOpt = obuilder.withLongName("path").withShortName("p")
-        .withRequired(true).withArgument(
-            abuilder.withName("path").withMinimum(1).withMaximum(1).create())
-        .withDescription("Data path").create();
-
-    Option descriptorOpt = obuilder.withLongName("descriptor").withShortName(
-        "d").withRequired(true).withArgument(
-        abuilder.withName("descriptor").withMinimum(1).create())
-        .withDescription("data descriptor").create();
-
-    Option descPathOpt = obuilder.withLongName("file").withShortName("f")
-        .withRequired(true).withArgument(
-            abuilder.withName("file").withMinimum(1).withMaximum(1).create())
-        .withDescription("Path to generated descriptor file").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-        "Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(
-        descPathOpt).withOption(descriptorOpt).withOption(helpOpt).create();
-
+    
+    Option pathOpt = obuilder.withLongName("path").withShortName("p").withRequired(true).withArgument(
+      abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+    
+    Option descriptorOpt = obuilder.withLongName("descriptor").withShortName("d").withRequired(true)
+        .withArgument(abuilder.withName("descriptor").withMinimum(1).create()).withDescription(
+          "data descriptor").create();
+    
+    Option descPathOpt = obuilder.withLongName("file").withShortName("f").withRequired(true).withArgument(
+      abuilder.withName("file").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Path to generated descriptor file").create();
+    
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+    
+    Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(descPathOpt).withOption(
+      descriptorOpt).withOption(helpOpt).create();
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       String dataPath = cmdLine.getValue(pathOpt).toString();
       String descPath = cmdLine.getValue(descPathOpt).toString();
-      List<String> descriptor = convert(cmdLine.getValues(descriptorOpt));
-
-      log.debug("Data path : {}", dataPath);
-      log.debug("Descriptor path : {}", descPath);
-      log.debug("Descriptor : {}", descriptor);
-
-      runTool(dataPath, descriptor, descPath);
+      List<String> descriptor = Describe.convert(cmdLine.getValues(descriptorOpt));
+      
+      Describe.log.debug("Data path : {}", dataPath);
+      Describe.log.debug("Descriptor path : {}", descPath);
+      Describe.log.debug("Descriptor : {}", descriptor);
+      
+      Describe.runTool(dataPath, descriptor, descPath);
     } catch (OptionException e) {
-      log.warn(e.toString(), e);
+      Describe.log.warn(e.toString(), e);
       CommandLineUtil.printHelp(group);
     }
   }
-
-  private static void runTool(String dataPath, List<String> description,
-      String filePath) throws DescriptorException, IOException {
-    log.info("Generating the descriptor...");
+  
+  private static void runTool(String dataPath, List<String> description, String filePath) throws DescriptorException,
+                                                                                         IOException {
+    Describe.log.info("Generating the descriptor...");
     String descriptor = DescriptorUtils.generateDescriptor(description);
-
-    Path fPath = validateOutput(filePath);
     
-    log.info("generating the dataset...");
-    Dataset dataset = generateDataset(descriptor, dataPath);
-
-    log.info("storing the dataset description");
-    storeWritable(new Configuration(), fPath, dataset);
+    Path fPath = Describe.validateOutput(filePath);
+    
+    Describe.log.info("generating the dataset...");
+    Dataset dataset = Describe.generateDataset(descriptor, dataPath);
+    
+    Describe.log.info("storing the dataset description");
+    Describe.storeWritable(new Configuration(), fPath, dataset);
   }
-
-  private static Dataset generateDataset(String descriptor, String dataPath)
-      throws IOException, DescriptorException {
+  
+  private static Dataset generateDataset(String descriptor, String dataPath) throws IOException,
+                                                                            DescriptorException {
     Path path = new Path(dataPath);
     FileSystem fs = path.getFileSystem(new Configuration());
-
+    
     return DataLoader.generateDataset(descriptor, fs, path);
   }
-
+  
   private static Path validateOutput(String filePath) throws IOException {
     Path path = new Path(filePath);
     FileSystem fs = path.getFileSystem(new Configuration());
@@ -143,11 +138,10 @@
     }
     return list;
   }
-
-  private static void storeWritable(Configuration conf, Path path, Writable dataset) 
-      throws IOException {
-    FileSystem fs = path.getFileSystem(conf);
   
+  private static void storeWritable(Configuration conf, Path path, Writable dataset) throws IOException {
+    FileSystem fs = path.getFileSystem(conf);
+    
     FSDataOutputStream out = fs.create(path);
     dataset.write(out);
     out.close();

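Describe only wires the command-line options to DescriptorUtils and DataLoader: it reads the raw data at --path, combines it with the --descriptor tokens to generate a Dataset, and writes that Dataset to --file. A hedged invocation sketch using the options declared above (the file names and the descriptor tokens are assumptions, and main() also declares IOException and DescriptorException):

    // "I 9 N L" is a hypothetical descriptor: one ignored column, nine numerical
    // attributes, then the class label; adjust it to the actual data format.
    Describe.main(new String[] {
        "-p", "glass.data",      // raw data (assumed name)
        "-f", "glass.info",      // where the generated descriptor file is written
        "-d", "I", "9", "N", "L"
    });
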
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java?rev=909900&r1=909899&r2=909900&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/tools/Frequencies.java Sat Feb 13 20:27:25 2010
@@ -17,6 +17,9 @@
 
 package org.apache.mahout.df.tools;
 
+import java.io.IOException;
+import java.util.Arrays;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -35,97 +38,90 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.util.Arrays;
-
 /**
  * Compute the frequency distribution of the "class label"
  */
-public class Frequencies extends Configured implements Tool  {
-
+public class Frequencies extends Configured implements Tool {
+  
   private static final Logger log = LoggerFactory.getLogger(Frequencies.class);
-
-  private Frequencies() {
-  }
-
+  
+  private Frequencies() {}
+  
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
-
+    
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option dataOpt = obuilder.withLongName("data").withShortName("d")
-        .withRequired(true).withArgument(
-            abuilder.withName("path").withMinimum(1).withMaximum(1).create())
-        .withDescription("Data path").create();
-
-    Option datasetOpt = obuilder.withLongName("dataset").withShortName(
-        "ds").withRequired(true).withArgument(
-        abuilder.withName("path").withMinimum(1).create())
-        .withDescription("dataset path").create();
-
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-        "Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(
-        datasetOpt).withOption(helpOpt).create();
-
+    
+    Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+      abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+    
+    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+      abuilder.withName("path").withMinimum(1).create()).withDescription("dataset path").create();
+    
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+    
+    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(helpOpt)
+        .create();
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return 0;
       }
-
+      
       String dataPath = cmdLine.getValue(dataOpt).toString();
       String datasetPath = cmdLine.getValue(datasetOpt).toString();
-
-      log.debug("Data path : {}", dataPath);
-      log.debug("Dataset path : {}", datasetPath);
-
+      
+      Frequencies.log.debug("Data path : {}", dataPath);
+      Frequencies.log.debug("Dataset path : {}", datasetPath);
+      
       runTool(dataPath, datasetPath);
     } catch (OptionException e) {
-      log.warn(e.toString(), e);
+      Frequencies.log.warn(e.toString(), e);
       CommandLineUtil.printHelp(group);
     }
-
+    
     return 0;
   }
-
-  private void runTool(String data, String dataset)
-          throws IOException, ClassNotFoundException, InterruptedException {
-
+  
+  private void runTool(String data, String dataset) throws IOException,
+                                                   ClassNotFoundException,
+                                                   InterruptedException {
+    
     FileSystem fs = FileSystem.get(getConf());
     Path workingDir = fs.getWorkingDirectory();
-
+    
     Path dataPath = new Path(data);
     Path datasetPath = new Path(dataset);
-
-    log.info("Computing the frequencies...");
+    
+    Frequencies.log.info("Computing the frequencies...");
     FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);
-
+    
     int[][] counts = job.run(getConf());
-
+    
     // compute the partitions' sizes
     int numPartitions = counts.length;
-//    int[] sizes = new int[numPartitions]; // TODO this isn't used?
-//    for (int p = 0; p < numPartitions; p++) {
-//      sizes[p] = DataUtils.sum(counts[p]);
-//    }
-
+    // int[] sizes = new int[numPartitions]; // TODO this isn't used?
+    // for (int p = 0; p < numPartitions; p++) {
+    // sizes[p] = DataUtils.sum(counts[p]);
+    // }
+    
     // outputting the frequencies
-    log.info("counts[partition][class]");
+    Frequencies.log.info("counts[partition][class]");
     for (int p = 0; p < numPartitions; p++) {
-      log.info(Arrays.toString(counts[p]));
+      Frequencies.log.info(Arrays.toString(counts[p]));
     }
   }
-
+  
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new Configuration(), new Frequencies(), args);
   }
-
+  
 }
\ No newline at end of file
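
Frequencies is a thin Hadoop Tool around FrequenciesJob: it counts, for each input partition, how many instances carry each class label and logs one counts[partition][class] row per partition. A hedged invocation sketch with the options declared above (file names are assumptions; the dataset descriptor is expected to have been generated beforehand, e.g. with Describe):

    // -d points at the raw data, -ds at the dataset descriptor file.
    Frequencies.main(new String[] {"-d", "glass.data", "-ds", "glass.info"});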