You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2008/08/19 14:55:48 UTC
svn commit: r687042 [2/4] - in /lucene/mahout/trunk: core/ core/src/main/java/org/apache/mahout/classifier/ core/src/main/java/org/apache/mahout/classifier/bayes/ core/src/main/java/org/apache/mahout/classifier/bayes/common/ core/src/main/java/org/apac...

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ *  Can also be used as a local Combiner. A simple summing reducer
+ *
+ **/
+
+public class BayesFeatureReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    //Key is label,word, value is the number of times we've seen this label word per local node.  Output is the same
+   
+    float sum = 0;
+    while (values.hasNext()) {
+      sum += values.next().get();
+    }
+    output.collect(key, new FloatWritable(sum));
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,111 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * The Driver which drives the Tf-Idf Generation.
+ *
+ * <p>The job reads the interim <code>trainer-termDocCount</code>, <code>trainer-wordFreq</code>
+ * and <code>trainer-featureCount</code> sequence files found under the output directory,
+ * hands the per-label document counts to the tasks through the job configuration
+ * and writes its result under <code>trainer-tfIdf</code>.</p>
+ *
+ **/
+public class BayesTfIdfDriver {
+  /**
+   * Takes in two arguments:
+   * <ol>
+   * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path} where to write the interim files as a {@link org.apache.hadoop.io.SequenceFile}</li>
+   * </ol>
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) {
+    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: BayesTfIdfDriver <input> <output>");
+    }
+    String input = args[0];
+    String output = args[1];
+
+    runJob(input, output);
+  }
+
+  /**
+   * Run the job
+   *
+   * @param input            the input pathname String (not read by this job; all of its
+   *                         inputs are interim files located under <code>output</code>)
+   * @param output           the output pathname String
+   * @throws RuntimeException wrapping any exception raised while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(BayesTfIdfDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
+    Path outPath = new Path(output + "/trainer-tfIdf");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+
+    conf.setMapperClass(BayesTfIdfMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(BayesTfIdfReducer.class);
+    conf.setReducerClass(BayesTfIdfReducer.class);
+    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
+
+    // Don't ever forget this. People should keep track of how hadoop conf parameters can make or break a piece of code.
+    // JavaSerialization is needed so that the HashMap below can be stringified into the configuration.
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      // Remove the result of a previous run, if any
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+
+      SequenceFileModelReader reader = new SequenceFileModelReader();
+
+      Path interimFile = new Path(output + "/trainer-docCount/part-*");
+
+      HashMap<String,Float> labelDocumentCounts = reader.readLabelDocumentCounts(dfs, interimFile, conf);
+
+      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(labelDocumentCounts));
+      try {
+        String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
+
+        // Round-trip the string so that what is printed is what the tasks will decode
+        System.out.println("Counts of documents in Each Label");
+        HashMap<String,Float> c = mapStringifier.fromString(labelDocumentCountString);
+        System.out.println(c);
+
+        conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
+      } finally {
+        // Release the serializer/deserializer held by the stringifier
+        mapStringifier.close();
+      }
+
+      client.setConf(conf);
+
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,104 @@
+package org.apache.mahout.classifier.bayes.common;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.GenericsUtil;
+
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * 
+ * 
+ */
+public class BayesTfIdfMapper extends MapReduceBase implements
+    Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+  public HashMap<String, Float> labelDocumentCounts = null;
+  String labelDocumentCountString =" ";
+  /**
+   * We need to calculate the Tf-Idf of each feature in each label
+   * 
+   * @param key The label,feature pair (can either be the freq Count or the term
+   *        Document count
+   * @param value
+   * @param output
+   * @param reporter
+   * @throws IOException
+   */
+  public void map(Text key, FloatWritable value,
+      OutputCollector<Text, FloatWritable> output, Reporter reporter)
+      throws IOException {
+ 
+    String labelFeaturePair = key.toString();
+   
+
+    if (labelFeaturePair.startsWith("-")) { // if it is the termDocumentCount
+      labelFeaturePair = labelFeaturePair.substring(1);
+      String label = labelFeaturePair.split(",")[0];
+      
+      if(labelDocumentCounts.containsKey(label)==false){
+        
+        throw new IOException(label);
+      }
+      
+      Float labelDocumentCount = labelDocumentCounts.get(label);
+      float logIdf = (float)Math.log(labelDocumentCount.floatValue()  / value.get());
+      
+      output.collect(new Text(labelFeaturePair), new FloatWritable(logIdf));
+    } 
+    else if (labelFeaturePair.startsWith(",")) {
+      output.collect(new Text("*vocabCount"), new FloatWritable(1.0f));
+    }
+    else {
+      output.collect(key, value);
+    }
+  }
+  
+  @Override
+  public void configure(JobConf job) {
+    try
+    {
+      if(labelDocumentCounts ==null){
+        labelDocumentCounts = new HashMap<String, Float>();
+
+        DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(job,GenericsUtil.getClass(labelDocumentCounts));
+
+        labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);  
+        labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
+        
+        
+        labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
+      }
+    }
+    catch(IOException ex){
+      
+      ex.printStackTrace();
+    }
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,59 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * A {@link MultipleOutputFormat} that stores its records in sequence files and picks
+ * the sub-directory of every record from the record's key.
+ */
+public class BayesTfIdfOutputFormat extends
+    MultipleOutputFormat<WritableComparable, Writable> {
+
+  /** Delegate that creates the actual writers; created on first use. */
+  private SequenceFileOutputFormat theSequenceFileOutputFormat = null;
+
+  /**
+   * Returns a sequence file record writer for the given file name.
+   */
+  @Override
+  protected RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
+      FileSystem fs, JobConf job, String name, Progressable arg3)
+      throws IOException {
+    SequenceFileOutputFormat delegate = theSequenceFileOutputFormat;
+    if (delegate == null) {
+      delegate = new SequenceFileOutputFormat();
+      theSequenceFileOutputFormat = delegate;
+    }
+    return delegate.getRecordWriter(fs, job, name, arg3);
+  }
+
+  /**
+   * Keys starting with "*" go to <code>trainer-vocabCount/</code>, every other key
+   * goes to <code>trainer-tfIdf/</code>.
+   */
+  @Override
+  protected String generateFileNameForKeyValue(WritableComparable k, Writable v,
+      String name) {
+    String keyText = ((Text) k).toString();
+    String directory = keyText.startsWith("*") ? "trainer-vocabCount/" : "trainer-tfIdf/";
+    return directory + name;
+  }
+
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,62 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ *  Reducer of the Tf-Idf job. Values of the "*vocabCount" key are summed, values
+ *  of every other key are multiplied together.
+ *  Can also be used as a local Combiner because only two values should be there inside the values
+ *  (that expectation is not enforced).
+ *
+ **/
+
+public class BayesTfIdfReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+
+  /**
+   * Aggregates the values of one key.
+   *
+   * @param key      either a key starting with "*vocabCount" or a label,word pair
+   * @param values   the values collected for the key
+   * @param output   receives the key with the aggregated value
+   * @param reporter not used
+   * @throws IOException if the collector fails
+   */
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    String token = key.toString();
+    float aggregate;
+
+    if (token.startsWith("*vocabCount")) {
+      // Vocabulary size: plain sum of the markers emitted by the mapper
+      aggregate = 0;
+      while (values.hasNext()) {
+        FloatWritable marker = values.next();
+        aggregate += marker.get();
+      }
+      System.out.println(token + "\t"+aggregate);
+    } else {
+      // idf times D_ij: product of everything seen for this label,word pair
+      aggregate = 1.0f;
+      while (values.hasNext()) {
+        FloatWritable factor = values.next();
+        aggregate *= factor.get();
+      }
+    }
+
+    output.collect(key, new FloatWritable(aggregate));
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,88 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+
+
+/**
+ * Create and run the weight summer job: it reads the sequence files under
+ * <code>trainer-tfIdf/trainer-tfIdf</code> of the output directory and writes
+ * its result under <code>trainer-weights</code>.
+ *
+ **/
+public class BayesWeightSummerDriver {
+  /**
+   * Takes in two arguments:
+   * <ol>
+   * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path} where to write the interim files as a {@link org.apache.hadoop.io.SequenceFile}</li>
+   * </ol>
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) {
+    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: BayesWeightSummerDriver <input> <output>");
+    }
+    String input = args[0];
+    String output = args[1];
+
+    runJob(input, output);
+  }
+
+  /**
+   * Run the job
+   *
+   * @param input            the input pathname String (not read by this job; its
+   *                         input is the tf-idf data located under <code>output</code>)
+   * @param output           the output pathname String
+   * @throws RuntimeException wrapping any exception raised while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
+    Path outPath = new Path(output + "/trainer-weights");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+    conf.setMapperClass(BayesWeightSummerMapper.class);
+    // The input consists of sequence files of Text keys and FloatWritable values
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(BayesWeightSummerReducer.class);
+    conf.setReducerClass(BayesWeightSummerReducer.class);
+    conf.setOutputFormat(BayesWeightSummerOutputFormat.class);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      // Remove the result of a previous run, if any
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      client.setConf(conf);
+
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,64 @@
+package org.apache.mahout.classifier.bayes.common;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+
+import java.io.IOException;
+
+/**
+ * Mapper of the weight summer job. Every incoming label,feature weight is
+ * re-emitted under three keys so that the reducer can build the per-feature,
+ * the per-label and the overall sum of weights.
+ */
+public class BayesWeightSummerMapper extends MapReduceBase implements
+    Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+
+  /**
+   * Emits the value three times: under ",feature", under "_label" and under "*".
+   * The label is the part of the key before the first comma, the feature is
+   * everything after it.
+   *
+   * @param key The label,feature pair
+   * @param value the weight stored for the pair
+   * @param output receives the three records
+   * @param reporter not used
+   * @throws IOException if the collector fails
+   */
+  public void map(Text key, FloatWritable value,
+      OutputCollector<Text, FloatWritable> output, Reporter reporter)
+      throws IOException {
+
+    String pair = key.toString();
+    int separator = pair.indexOf(",");
+
+    Text labelKey = new Text("_" + pair.substring(0, separator));
+    Text featureKey = new Text("," + pair.substring(separator + 1));
+    Text totalKey = new Text("*");
+
+    // Sigma_j: sum of weight for all labels for a feature
+    output.collect(featureKey, value);
+    // Sigma_k: sum of weight for all features for a label
+    output.collect(labelKey, value);
+    // Sigma_kSigma_j: sum of weight of all features for all label
+    output.collect(totalKey, value);
+
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,62 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * A {@link MultipleOutputFormat} that stores its records in sequence files and picks
+ * the sub-directory of every record from the first character of the record's key.
+ */
+public class BayesWeightSummerOutputFormat extends
+    MultipleOutputFormat<WritableComparable, Writable> {
+
+  /** Delegate that creates the actual writers; created on first use. */
+  private SequenceFileOutputFormat theSequenceFileOutputFormat = null;
+
+  /**
+   * Returns a sequence file record writer for the given file name.
+   */
+  @Override
+  protected RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
+      FileSystem fs, JobConf job, String name, Progressable arg3)
+      throws IOException {
+    SequenceFileOutputFormat delegate = theSequenceFileOutputFormat;
+    if (delegate == null) {
+      delegate = new SequenceFileOutputFormat();
+      theSequenceFileOutputFormat = delegate;
+    }
+    return delegate.getRecordWriter(fs, job, name, arg3);
+  }
+
+  /**
+   * Maps the key prefix to an output directory:
+   * "*" to <code>Sigma_kSigma_j/</code>, "," to <code>Sigma_j/</code> and
+   * "_" to <code>Sigma_k/</code>. Any other key ends up in a single junk file.
+   */
+  @Override
+  protected String generateFileNameForKeyValue(WritableComparable k, Writable v,
+      String name) {
+    String keyText = ((Text) k).toString();
+
+    String directory;
+    if (keyText.startsWith("*")) {
+      // sum of weight of all features for all label
+      directory = "Sigma_kSigma_j/";
+    } else if (keyText.startsWith(",")) {
+      // sum of weight for all labels for a feature
+      directory = "Sigma_j/";
+    } else if (keyText.startsWith("_")) {
+      // sum of weights for all features for a label
+      directory = "Sigma_k/";
+    } else {
+      return "JunkFileThisShouldNotHappen";
+    }
+    return directory + name;
+  }
+
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ *  Can also be used as a local Combiner 
+ *
+ **/
+
+public class BayesWeightSummerReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    //Key is label,word, value is the tfidf of the feature  of times we've seen this label word per local node.  Output is the same
+
+    float sum = 0;
+    while (values.hasNext()) {
+      sum += values.next().get();
+    }
+    output.collect(key, new FloatWritable(sum));
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,321 @@
+package org.apache.mahout.classifier.bayes.io;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.common.Model;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * This Class reads the different interim  files created during the Training stage as well as the Model File during testing.
+ * 
+ */
+public class SequenceFileModelReader {
+
+  public Model loadModel(Model model, FileSystem fs, Map<String, Path> pathPatterns,
+      Configuration conf) throws IOException {
+
+    loadFeatureWeights(model, fs, pathPatterns.get("sigma_j"), conf);
+    loadLabelWeights(model, fs, pathPatterns.get("sigma_k"), conf); 
+    loadSumWeight(model, fs, pathPatterns.get("sigma_kSigma_j"), conf); 
+    loadThetaNormalizer(model, fs, pathPatterns.get("thetaNormalizer"), conf); 
+    
+   
+    model.initializeWeightMatrix();
+    
+    loadWeightMatrix(model, fs, pathPatterns.get("weight"), conf);
+    model.InitializeNormalizer();
+    //model.GenerateComplementaryModel();
+    return model;
+  }
+
+  public Model loadWeightMatrix(Model model, FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
+
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+
+        int idx = keyStr.indexOf(",");
+        if (idx != -1) {
+          model.loadFeatureWeight(keyStr.substring(0, idx), keyStr.substring(idx + 1), value.get());
+        }
+
+      }
+    }
+
+    return model;
+  }
+
+  public Model loadFeatureWeights(Model model, FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+    
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+
+        if (keyStr.startsWith(",")) { // Sum of weights for a Feature
+          model.setSumFeatureWeight(keyStr.substring(1),
+              value.get());
+        }
+      }
+    }
+    return model;
+  }
+
+  public Model loadLabelWeights(Model model,FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+
+        if (keyStr.startsWith("_")) { // Sum of weights in a Label
+          model.setSumLabelWeight(keyStr.substring(1), value
+              .get());
+        }
+      }
+    }
+
+    return model;
+  }
+  
+  public Model loadThetaNormalizer(Model model,FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();        
+        if (keyStr.startsWith("_")) { // Sum of weights in a Label
+          model.setThetaNormalizer(keyStr.substring(1), value
+              .get());
+        }
+      }
+    }
+
+    return model;
+  }
+
+  public Model loadSumWeight(Model model, FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+
+        if (keyStr.startsWith("*")) { // Sum of weights for all Feature
+          // and all Labels
+          model.setSigma_jSigma_k(value.get());
+          System.out.println(value.get());
+        }
+      }
+    }
+    return model;
+  }
+
+  public void createMapFile(FileSystem fs, Path pathPattern, Configuration conf)
+      throws IOException {
+
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+    MapFile.Writer writer = new MapFile.Writer(conf, fs, "data.mapfile",
+        Text.class, FloatWritable.class);
+    MapFile.Writer.setIndexInterval(conf, 3);
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      System.out.println(path.toString());
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+
+        if (keyStr.startsWith("_")) {
+
+        } else if (keyStr.startsWith(",")) {
+
+        } else if (keyStr.startsWith("*")) {
+
+        } else {
+          int idx = keyStr.indexOf(",");
+          if (idx != -1) {
+            HashMap<String, Float> data = new HashMap<String, Float>();
+            data.put(keyStr.substring(0, idx), new Float(value.get()));
+            writer.append(new Text(key.toString()), value);
+          }
+        }
+      }
+    }
+    writer.close();
+    // return model;
+  }
+
+  public HashMap<String, Float> readLabelSums(FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+    HashMap<String, Float> labelSum = new HashMap<String, Float>();
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+   
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+        if (keyStr.startsWith("_")) { // Sum of weights of labels
+          labelSum.put(keyStr.substring(1), new Float(value.get()));
+        }
+
+      }
+    }
+
+    return labelSum;
+  }
+
+  public HashMap<String, Float> readLabelDocumentCounts(FileSystem fs,
+      Path pathPattern, Configuration conf) throws IOException {
+    HashMap<String, Float> labelDocumentCounts = new HashMap<String, Float>();
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // the key is either _label_ or label,feature
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+        if (keyStr.startsWith("_")) { // Count of Documents in a Label
+          labelDocumentCounts.put(keyStr.substring(1), new Float(value.get()));
+        }
+
+      }
+    }
+
+    return labelDocumentCounts;
+  }
+
+  public Float readSigma_jSigma_k(FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+    HashMap<String, Float> weightSum = new HashMap<String, Float>();
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // the key is *
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+        if (weightSum.size() > 1) {
+          throw new IOException("Incorrect Sum File");
+        } else if (keyStr.startsWith("*")) {
+          weightSum.put(keyStr, new Float(value.get()));
+        }
+
+      }
+    }
+
+    Float sigma_jSigma_k = weightSum.get("*");
+    return sigma_jSigma_k;
+  }
+
+  public Float readVocabCount(FileSystem fs, Path pathPattern,
+      Configuration conf) throws IOException {
+    HashMap<String, Float> weightSum = new HashMap<String, Float>();
+    Writable key = new Text();
+    FloatWritable value = new FloatWritable();
+
+    FileStatus[] outputFiles = fs.globStatus(pathPattern);
+    for (FileStatus fileStatus : outputFiles) {
+      Path path = fileStatus.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+      // the key is *
+      while (reader.next(key, value)) {
+        String keyStr = key.toString();
+        if (weightSum.size() > 1) {
+          throw new IOException("Incorrect vocabCount File");
+        }
+        if (keyStr.startsWith("*")) {
+          weightSum.put(keyStr, new Float(value.get()));
+        }
+
+      }
+    }
+
+    Float sigma_jSigma_k = weightSum.get("*vocabCount");
+    return sigma_jSigma_k;
+  }
+
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html Tue Aug 19 05:55:45 2008
@@ -0,0 +1,89 @@
+<!--
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+-->
+<HTML>
+<HEAD>
+  <TITLE>org.apache.mahout.classifier.bayes</TITLE>
+</HEAD>
+<BODY>
+<DIV><h2>Introduction</h2>
+  The bayes package provides an implementation of a Map Reduce enabled naive bayes classifier. The naive bayes
+  classifier is a very simple classifier that counts the occurrences of words in association with a label which
+  can then be used to determine the likelihood that a new document, and its words, should be assigned a particular
+  label.
+</DIV>
+<div><h2>Implementation</h2>
+
+  <p>The implementation is divided up into three parts:
+  <ol>
+    <li>The Trainer -- responsible for doing the counting of the words and the labels</li>
+    <li>The Model -- responsible for holding the training data in a useful way</li>
+    <li>The Classifier -- responsible for using the trainer's output to determine the category of previously unseen
+      documents
+    </li>
+  </ol>
+  </p>
+  <div><h3>The Trainer</h3>
+
+    <p>The trainer is manifested in several classes:
+    <ol>
+      <li>{@link org.apache.mahout.classifier.bayes.BayesDriver} -- Creates the Hadoop Naive Bayes job and outputs
+        the model. This Driver encapsulates a lot of intermediate Map-Reduce Classes
+      </li>
+      <li>{@link org.apache.mahout.classifier.bayes.common.BayesFeatureDriver} 
+      </li>
+      <li>{@link org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver} 
+      </li>
+      <li>{@link org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver}
+      </li>
+      <li>{@link org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver}
+      </li>
+    </ol>
+    The trainer assumes that the input files are in the {@link org.apache.hadoop.mapred.KeyValueTextInputFormat}, i.e.
+    the first token of the line
+    is the label and separated from the remaining tokens on the line by a tab-delimiter. The remaining tokens are the unique features (words). Thus, input documents might look
+    like:
+    <pre>
+      hockey puck stick goalie forward defenseman referee ice checking slapshot helmet
+      football field football pigskin referee helmet turf tackle
+    </pre>
+    where hockey and football are the labels and the remaining words are the features associated with those particular
+    labels.</p>
+    <p>The output from the trainer is a {@link org.apache.hadoop.io.SequenceFile}.</p>
+  </div>
+  <div><h3>The Model</h3>
+    <p>The {@link org.apache.mahout.classifier.bayes.BayesModel} is the data structure used to represent the results of the training
+    for use by the {@link org.apache.mahout.classifier.bayes.BayesClassifier}.  A Model can be created by hand, or, if using
+    the {@link org.apache.mahout.classifier.bayes.BayesDriver}, it can be created from the {@link
+      org.apache.hadoop.io.SequenceFile} that is output.  To create it from the SequenceFile, use the
+      {@link org.apache.mahout.classifier.bayes.io.SequenceFileModelReader} located in the io subpackage.</p>
+  </div>
+  <div><h3>The Classifier</h3>
+
+    <p>The {@link org.apache.mahout.classifier.bayes.BayesClassifier} is responsible for using a {@link
+      org.apache.mahout.classifier.bayes.BayesModel} to classify
+      documents into categories.</p>
+  </div>
+</div>
+
+<DIV>&nbsp;</DIV>
+<DIV align="center">
+  Copyright &copy; 2008 <A HREF="http://www.apache.org">Apache Software Foundation</A>
+</DIV>
+</BODY>
+</HTML>
\ No newline at end of file

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,134 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.util.PriorityQueue;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.common.Classifier;
+import org.apache.mahout.common.Model;
+
+import java.util.Collection;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.LinkedList;
+
+/**
+ * Classifies documents based on a {@link CBayesModel}.
+ * A lower (more negative) score ranks a category higher; categories whose score is not
+ * below zero are never selected.
+ */
+public class CBayesClassifier implements Classifier{
+
+  /**
+   * Classify the document and return the top <code>numResults</code> categories.
+   *
+   * @param model           The model
+   * @param document        The document to classify
+   * @param defaultCategory The category returned (with score 0) when no category scores below zero
+   * @param numResults      The maximum number of results to return, ranked by score.  Ties are broken by comparing the category
+   * @return A Collection of {@link org.apache.mahout.classifier.ClassifierResult}s, lowest score first.
+   */
+  public Collection<ClassifierResult> classify(Model model, String[] document, String defaultCategory, int numResults) {
+    Collection<String> labels = model.getLabels();
+    PriorityQueue queue = new ClassifierResultPriorityQueue(numResults);
+    for (String label : labels) {
+      float score = documentProbability(model, label, document);
+      if (score < 0) {
+        queue.insert(new ClassifierResult(label, score));
+      }
+    }
+
+    // Drain the queue; pop() hands back null once it is empty.
+    LinkedList<ClassifierResult> ranked = new LinkedList<ClassifierResult>();
+    for (Object next = queue.pop(); next != null; next = queue.pop()) {
+      ranked.addLast((ClassifierResult) next);
+    }
+    if (ranked.isEmpty()) {
+      ranked.add(new ClassifierResult(defaultCategory, 0));
+    }
+    return ranked;
+  }
+
+  /**
+   * Classify the document according to the {@link org.apache.mahout.common.Model}
+   *
+   * @param model           The trained {@link org.apache.mahout.common.Model}
+   * @param document        The document to classify
+   * @param defaultCategory The category kept (with score 0) if no category scores below zero
+   * @return The category with the lowest score
+   */
+  public ClassifierResult classify(Model model, String[] document, String defaultCategory) {
+    ClassifierResult best = new ClassifierResult(defaultCategory);
+    float lowest = 0.0f;
+
+    for (String label : model.getLabels()) {
+      float score = documentProbability(model, label, document);
+      if (score < lowest) {
+        lowest = score;
+        best.setLabel(label);
+      }
+    }
+    best.setScore(lowest);
+    return best;
+  }
+
+  /**
+   * Scores the document for a label: for every distinct word, the number of times it occurs
+   * in the document is multiplied by {@link org.apache.mahout.common.Model#FeatureWeight(String, String)}
+   * for that label and word, and the products are added up.
+   *
+   * @param model       The {@link org.apache.mahout.common.Model}
+   * @param label       The label to score the document against
+   * @param document    The document, as an array of words
+   * @return The summed score
+   * @see Model#FeatureWeight(String, String)
+   */
+  public float documentProbability(Model model, String label, String[] document) {
+    // Count how often each word occurs.
+    Hashtable<String, Integer> occurrences = new Hashtable<String, Integer>(1000);
+    for (String word : document) {
+      Integer seen = occurrences.get(word);
+      occurrences.put(word, seen == null ? 1 : seen + 1);
+    }
+
+    float total = 0.0f;
+    Enumeration<String> words = occurrences.keys();
+    while (words.hasMoreElements()) {
+      String word = words.nextElement();
+      Integer count = occurrences.get(word);
+      total += count * model.FeatureWeight(label, word);
+    }
+    return total;
+  }
+
+
+  /** Bounded queue ordering results by ascending score, then by ascending label. */
+  private static class ClassifierResultPriorityQueue extends PriorityQueue {
+
+    private ClassifierResultPriorityQueue(int numResults) {
+      initialize(numResults);
+    }
+
+    protected boolean lessThan(Object a, Object b) {
+      ClassifierResult left = (ClassifierResult) a;
+      ClassifierResult right = (ClassifierResult) b;
+
+      float leftScore = left.getScore();
+      float rightScore = right.getScore();
+      if (leftScore == rightScore) {
+        return left.getLabel().compareTo(right.getLabel()) < 0;
+      }
+      return leftScore < rightScore;
+    }
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,126 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.common.BayesFeatureDriver;
+import org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver;
+import org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver;
+
+/**
+ * Create and run the Complementary Naive Bayes trainer: a sequence of Map-Reduce jobs
+ * followed by removal of the intermediate output directories.
+ */
+public class CBayesDriver {
+  /**
+   * Takes in two arguments:
+   * <ol>
+   * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
+   * live</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path} where to write the
+   * {@link org.apache.mahout.common.Model} as a
+   * {@link org.apache.hadoop.io.SequenceFile}</li>
+   * </ol>
+   * The job is run with a gram size of 1.
+   *
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) {
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: CBayesDriver <input path> <output path>");
+    }
+    String input = args[0];
+    String output = args[1];
+
+    runJob(input, output, 1);
+  }
+
+  /**
+   * Run the job. Any existing output directory is deleted first. The feature, tf-idf,
+   * weight-summing and theta-normalizer jobs are then run in that order, and finally the
+   * intermediate directories that are no longer needed are deleted.
+   *
+   * @param input the input pathname String
+   * @param output the output pathname String
+   * @param gramSize the gram size handed to the feature job
+   * @throws RuntimeException wrapping any exception raised while running the jobs
+   */
+  @SuppressWarnings("deprecation")
+  public static void runJob(String input, String output, int gramSize) {
+    JobConf conf = new JobConf(CBayesDriver.class);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      deleteIfExists(dfs, new Path(output));
+
+      System.out.println("Reading features...");
+      //Read the features in each document normalized by length of each document
+      BayesFeatureDriver.runJob(input, output, gramSize);
+
+      System.out.println("Calculating Tf-Idf...");
+      //Calculate the TfIdf for each word in each label
+      BayesTfIdfDriver.runJob(input, output);
+
+      System.out.println("Calculating weight sums for labels and features...");
+      //Calculate the Sums of weights for each label, for each feature and for each feature and for each label
+      BayesWeightSummerDriver.runJob(input, output);
+
+      //System.out.println("Calculating the weight of the features of each label in the complement class...");
+      //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
+      //CBayesThetaDriver.runJob(input, output);
+
+      System.out.println("Calculating the weight Normalisation factor for each complement class...");
+      //Calculate the normalization factor Sigma_W_ij for each complement class.
+      CBayesThetaNormalizerDriver.runJob(input, output);
+
+      //System.out.println("Calculating the final Weight Normalized Complementary Naive Bayes Model...");
+      //Calculate the normalization factor Sigma_W_ij for each complement class.
+      //CBayesNormalizedWeightDriver.runJob(input, output);
+
+      // Remove the intermediate output. trainer-tfIdf, trainer-weights, trainer-theta and
+      // trainer-thetaNormalizer are deliberately NOT deleted here (their deletion was
+      // commented out by the original author; presumably they make up the model).
+      deleteIfExists(dfs, new Path(output + "/trainer-docCount"));
+      deleteIfExists(dfs, new Path(output + "/trainer-termDocCount"));
+      deleteIfExists(dfs, new Path(output + "/trainer-featureCount"));
+      deleteIfExists(dfs, new Path(output + "/trainer-wordFreq"));
+      deleteIfExists(dfs, new Path(output + "/trainer-tfIdf/trainer-vocabCount"));
+      deleteIfExists(dfs, new Path(output + "/trainer-vocabCount"));
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Recursively deletes <code>path</code> if it exists; does nothing otherwise.
+   */
+  private static void deleteIfExists(FileSystem fs, Path path) throws java.io.IOException {
+    if (fs.exists(path)) {
+      fs.delete(path, true);
+    }
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,194 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.mahout.common.Model;
+
+import java.util.Map;
+
+
+/**
+ * Complement Naive Bayes implementation of {@link Model}.
+ * <p>
+ * For a label and a feature the weight is derived from
+ * {@code log((sigma_j - D_ij + alpha_i) / (sigma_jSigma_k - sigma_k + vocabCount))},
+ * where {@code D_ij} is the value stored for the pair, {@code sigma_j} is
+ * {@code getSumFeatureWeight(feature)}, {@code sigma_k} is
+ * {@code getSumLabelWeight(label)} and {@code vocabCount} is the size of the
+ * feature list.
+ */
+public class CBayesModel extends Model {
+
+  /**
+   * Computes the normalized complement weight of a feature for a label.
+   *
+   * @param label index of the label
+   * @param feature index of the feature
+   * @return {@code -log(numerator / denominator) / getThetaNormalizer(label)}
+   */
+  @Override
+  protected float getWeight(Integer label, Integer feature) {
+    float D_ij = lookupStoredWeight(label, feature);
+    float vocabCount = featureList.size();
+    float sumLabelWeight = getSumLabelWeight(label);
+    float sigma_j = getSumFeatureWeight(feature);
+
+    float numerator = sigma_j - D_ij + alpha_i;
+    float denominator = sigma_jSigma_k - sumLabelWeight + vocabCount;
+
+    // Plain narrowing cast; no need to allocate a Double just to call floatValue().
+    float weight = (float) Math.log(numerator / denominator);
+    return -1.0f * (weight / getThetaNormalizer(label));
+  }
+
+  /**
+   * Returns the value stored for a (label, feature) pair without any processing.
+   *
+   * @param label index of the label
+   * @param feature index of the feature
+   * @return the stored value, or 0 when nothing is stored for the pair
+   */
+  @Override
+  protected float getWeightUnprocessed(Integer label, Integer feature) {
+    return lookupStoredWeight(label, feature);
+  }
+
+  /**
+   * Shared lookup used by {@link #getWeight(Integer, Integer)} and
+   * {@link #getWeightUnprocessed(Integer, Integer)}.
+   *
+   * @return the stored value, or 0 when the feature has no entry for the label
+   */
+  private float lookupStoredWeight(Integer label, Integer feature) {
+    Map<Integer, Float> featureWeights = featureLabelWeights.get(feature);
+    // NOTE(review): whether the per-feature map can ever be absent depends on
+    // how Model populates featureLabelWeights; treat a missing map as "no weight".
+    if (featureWeights == null) {
+      return 0.0f;
+    }
+    Float stored = featureWeights.get(label);
+    return stored == null ? 0.0f : stored.floatValue();
+  }
+
+  /**
+   * Divides every entry of {@code thetaNormalizer} by the smallest absolute
+   * value found among its entries. The map is printed to standard output
+   * before and after the rescaling.
+   */
+  @Override
+  public void InitializeNormalizer() {
+    float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
+
+    System.out.println(thetaNormalizer);
+    for (float Sigma_W_ij : thetaNormalizer.values()) {
+      if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
+        perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
+      }
+    }
+
+    // Only existing keys are overwritten, so the map is not structurally
+    // modified while its key set is being iterated.
+    for (Integer label : thetaNormalizer.keySet()) {
+      float Sigma_W_ij = thetaNormalizer.get(label);
+      thetaNormalizer.put(label, Sigma_W_ij
+          / perLabelWeightSumNormalisationFactor);
+    }
+    System.out.println(thetaNormalizer);
+  }
+
+  /**
+   * Converts the stored (label, feature) values into normalized complement
+   * weights in three passes:
+   * <ol>
+   * <li>every non-zero stored value is replaced by its complement log weight,
+   * while the log weights of <em>all</em> pairs are summed per label;</li>
+   * <li>the per-label sums are divided by the smallest absolute sum;</li>
+   * <li>every non-zero stored weight is divided by its label's scaled sum and
+   * negated.</li>
+   * </ol>
+   *
+   * @throws RuntimeException if any step fails; the failure is propagated
+   *         instead of being printed and ignored, because a partially
+   *         generated model would otherwise be used silently
+   */
+  @Override
+  public void GenerateModel() {
+    try {
+      float vocabCount = featureList.size();
+
+      float[] perLabelThetaNormalizer = new float[labelList.size()];
+
+      float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
+
+      for (int feature = 0, maxFeatures = featureList.size(); feature < maxFeatures; feature++) {
+        for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+
+          float D_ij = getWeightUnprocessed(label, feature);
+          float sumLabelWeight = getSumLabelWeight(label);
+          float sigma_j = getSumFeatureWeight(feature);
+
+          float numerator = (sigma_j - D_ij) + alpha_i;
+          float denominator = (sigma_jSigma_k - sumLabelWeight) + vocabCount;
+
+          // Primitive float: avoids boxing and unboxing once per pair.
+          float weight = (float) Math.log(numerator / denominator);
+
+          // Pairs that were never seen keep no stored weight ...
+          if (D_ij != 0) {
+            setWeight(label, feature, weight);
+          }
+
+          // ... but still contribute to the label's normalizer.
+          perLabelThetaNormalizer[label] += weight;
+        }
+      }
+
+      System.out.println("Normalizing Weights");
+      for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+        float Sigma_W_ij = perLabelThetaNormalizer[label];
+        if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
+          perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
+        }
+      }
+
+      for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+        perLabelThetaNormalizer[label] = perLabelThetaNormalizer[label]
+            / perLabelWeightSumNormalisationFactor;
+      }
+
+      for (int feature = 0, maxFeatures = featureList.size(); feature < maxFeatures; feature++) {
+        for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+          float W_ij = getWeightUnprocessed(label, feature);
+          if (W_ij == 0) {
+            continue;
+          }
+          float Sigma_W_ij = perLabelThetaNormalizer[label];
+          float normalizedWeight = -1.0f * (W_ij / Sigma_W_ij);
+          setWeight(label, feature, normalizedWeight);
+        }
+      }
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to generate the complement naive bayes model", e);
+    }
+  }
+
+  /**
+   * Get the weighted probability of the feature.
+   *
+   * @param label The label of the feature
+   * @param feature The feature to calc. the prob. for
+   * @return The weighted probability, as computed by {@link #getWeight(Integer, Integer)}
+   */
+  @Override
+  public float FeatureWeight(Integer label, Integer feature) {
+    return getWeight(label, feature);
+  }
+
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,122 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * Configures and runs the map/reduce job that reads the entries under
+ * {@code <output>/trainer-theta} and writes normalized weights to
+ * {@code <output>/trainer-weight}. The per-label theta normalizers are read
+ * from {@code <output>/trainer-thetaNormalizer}, rescaled, and handed to the
+ * mappers through the job configuration.
+ */
+public class CBayesNormalizedWeightDriver {
+  /**
+   * Takes in two arguments:
+   * <ol>
+   * <li>The input path; it is passed on to {@link #runJob(String, String)}, which does not read it</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path} that already holds the intermediate trainer results and receives the {@code trainer-weight} directory</li>
+   * </ol>
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are given
+   */
+  public static void main(String[] args) {
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: CBayesNormalizedWeightDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Run the job
+   *
+   * @param input            the input pathname String (not used by this job)
+   * @param output           the output pathname String; the job reads and writes sub-directories of it
+   * @throws RuntimeException wrapping any failure while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
+    Path outPath = new Path(output + "/trainer-weight");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
+    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
+    conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+    // Registers JavaSerialization next to WritableSerialization. The HashMap
+    // handed to DefaultStringifier below is not a Writable, so it presumably
+    // depends on this setting; do not remove it.
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      // A previous run's output would make the job fail, so clear it first.
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+
+      SequenceFileModelReader reader = new SequenceFileModelReader();
+
+      Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
+      HashMap<String,Float> thetaNormalizer = reader.readLabelSums(dfs, thetaNormalizationsFiles, conf);
+
+      // Find the smallest absolute normalizer ...
+      float perLabelWeightSumNormalisationFactor = Float.MAX_VALUE;
+      for (String label : thetaNormalizer.keySet()) {
+        float Sigma_W_ij = thetaNormalizer.get(label);
+        if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
+          perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
+        }
+      }
+
+      // ... and express every normalizer as a multiple of it. Only existing
+      // keys are overwritten, so iterating the key set stays valid.
+      for (String label : thetaNormalizer.keySet()) {
+        float Sigma_W_ij = thetaNormalizer.get(label);
+        thetaNormalizer.put(label, Sigma_W_ij / perLabelWeightSumNormalisationFactor);
+      }
+
+      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(thetaNormalizer));
+      String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
+
+      // Round-trip the serialized map and print it, so a broken serialization
+      // setup shows up here rather than inside the mappers.
+      HashMap<String,Float> c = mapStringifier.fromString(thetaNormalizationsString);
+      System.out.println(c);
+      conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);
+
+      // JobClient.runJob is static; no JobClient instance is needed.
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,85 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.GenericsUtil;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Mapper that turns each incoming value into a normalized weight:
+ * {@code -log(value) / thetaNormalizer(label)}, where the label is the part of
+ * the key before the first comma and the normalizers come from the job
+ * configuration entry {@code cnaivebayes.thetaNormalizations}.
+ */
+public class CBayesNormalizedWeightMapper extends MapReduceBase implements
+    Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+  /** Per-label normalizers; filled in by {@link #configure(JobConf)}. */
+  public HashMap<String, Float> thetaNormalizer = null;
+
+  /** Serialized form of the normalizers as read in {@link #configure(JobConf)}. */
+  String thetaNormalizationsString = " ";
+
+  /**
+   * Emits the key unchanged together with
+   * {@code -log(value) / thetaNormalizer.get(label)}.
+   *
+   * @param key The label,feature pair; everything before the first comma is taken as the label
+   * @param value The value whose natural logarithm is normalized
+   * @param output Collector receiving the (key, normalized weight) pair
+   * @param reporter Not used
+   * @throws IOException if no normalizer is known for the key's label, or if the collector fails
+   */
+  public void map(Text key, FloatWritable value,
+      OutputCollector<Text, FloatWritable> output, Reporter reporter)
+      throws IOException {
+
+    String labelFeaturePair = key.toString();
+
+    // indexOf avoids compiling a regex per record and copes with keys that
+    // consist only of separators.
+    int separator = labelFeaturePair.indexOf(',');
+    String label = separator < 0 ? labelFeaturePair : labelFeaturePair.substring(0, separator);
+
+    Float normalizer = thetaNormalizer.get(label);
+    if (normalizer == null) {
+      // Fail with the offending label instead of a bare NullPointerException.
+      throw new IOException("No theta normalizer found for label '" + label
+          + "' (key: " + labelFeaturePair + ")");
+    }
+
+    float normalizedWeight = -1.0f * (float) Math.log(value.get()) / normalizer.floatValue();
+    output.collect(key, new FloatWritable(normalizedWeight));
+  }
+
+  /**
+   * Deserializes the per-label normalizers from the job configuration entry
+   * {@code cnaivebayes.thetaNormalizations}. When the entry is missing, the
+   * serialized form of an empty map is used instead. Does nothing if the
+   * normalizers are already set.
+   *
+   * @param job The job configuration
+   * @throws IllegalStateException if the normalizers cannot be (de)serialized
+   */
+  @Override
+  public void configure(JobConf job) {
+    if (thetaNormalizer != null) {
+      return;
+    }
+    try {
+      thetaNormalizer = new HashMap<String, Float>();
+
+      DefaultStringifier<HashMap<String, Float>> mapStringifier = new DefaultStringifier<HashMap<String, Float>>(
+          job, GenericsUtil.getClass(thetaNormalizer));
+
+      // Default value: the serialized empty map.
+      thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
+      thetaNormalizationsString = job.get("cnaivebayes.thetaNormalizations",
+          thetaNormalizationsString);
+      thetaNormalizer = mapStringifier.fromString(thetaNormalizationsString);
+    } catch (IOException ex) {
+      // Without normalizers every map() call would fail later on, so stop here
+      // and keep the cause.
+      throw new IllegalStateException(
+          "Could not read cnaivebayes.thetaNormalizations from the job configuration", ex);
+    }
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,53 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ * Sums all values received for a key and emits the key with that sum.
+ * Can also be used as a local combiner, because it only adds values up and
+ * emits the same key and value types it consumes.
+ */
+public class CBayesNormalizedWeightReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+
+  /**
+   * Adds up the values of one key.
+   *
+   * @param key The key, passed through unchanged
+   * @param values The values to add up
+   * @param output Collector receiving the (key, sum) pair
+   * @param reporter Not used
+   * @throws IOException if the collector fails
+   */
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    float weight = 0.0f;
+    while (values.hasNext()) {
+      weight += values.next().get();
+    }
+    output.collect(key, new FloatWritable(weight));
+  }
+
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,127 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * Configures and runs the map/reduce job that reads
+ * {@code <output>/trainer-weights/Sigma_j} and
+ * {@code <output>/trainer-tfIdf/trainer-tfIdf} and writes
+ * {@code <output>/trainer-theta}. The per-label sums, the overall sum and the
+ * vocabulary count are read up front and handed to the tasks through the job
+ * configuration under the {@code cnaivebayes.*} keys.
+ */
+public class CBayesThetaDriver {
+  /**
+   * Takes in two arguments:
+   * <ol>
+   * <li>The input path; it is passed on to {@link #runJob(String, String)}, which does not read it</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path} that already holds the intermediate trainer results and receives the {@code trainer-theta} directory</li>
+   * </ol>
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are given
+   */
+  public static void main(String[] args) {
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: CBayesThetaDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Run the job
+   *
+   * @param input            the input pathname String (not used by this job)
+   * @param output           the output pathname String; the job reads and writes sub-directories of it
+   * @throws RuntimeException wrapping any failure while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(CBayesThetaDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-weights/Sigma_j"));
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
+    Path outPath = new Path(output + "/trainer-theta");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setMapperClass(CBayesThetaMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    // No combiner is configured for this job.
+    conf.setReducerClass(CBayesThetaReducer.class);
+    conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+    // Registers JavaSerialization next to WritableSerialization. The HashMap
+    // and Float handed to DefaultStringifier below are not Writables, so they
+    // presumably depend on this setting; do not remove it.
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      // A previous run's output would make the job fail, so clear it first.
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+
+      SequenceFileModelReader reader = new SequenceFileModelReader();
+
+      // Per-label sums -> cnaivebayes.sigma_k
+      Path sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
+      HashMap<String,Float> labelWeightSum = reader.readLabelSums(dfs, sigma_kFiles, conf);
+      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(labelWeightSum));
+      String labelWeightSumString = mapStringifier.toString(labelWeightSum);
+
+      // Each value is round-tripped through its stringifier and printed, so a
+      // broken serialization setup shows up here rather than inside the tasks.
+      System.out.println("Sigma_k for Each Label");
+      HashMap<String,Float> c = mapStringifier.fromString(labelWeightSumString);
+      System.out.println(c);
+      conf.set("cnaivebayes.sigma_k", labelWeightSumString);
+
+      // Overall sum -> cnaivebayes.sigma_jSigma_k
+      Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
+      Float sigma_jSigma_k = reader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
+      DefaultStringifier<Float> floatStringifier = new DefaultStringifier<Float>(conf, Float.class);
+      String sigma_jSigma_kString = floatStringifier.toString(sigma_jSigma_k);
+
+      System.out.println("Sigma_kSigma_j for each Label and for each Features");
+      Float retSigma_jSigma_k = floatStringifier.fromString(sigma_jSigma_kString);
+      System.out.println(retSigma_jSigma_k);
+      conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
+
+      // Vocabulary count -> cnaivebayes.vocabCount
+      Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
+      Float vocabCount = reader.readVocabCount(dfs, vocabCountFile, conf);
+      String vocabCountString = floatStringifier.toString(vocabCount);
+
+      System.out.println("Vocabulary Count");
+      conf.set("cnaivebayes.vocabCount", vocabCountString);
+      Float retvocabCount = floatStringifier.fromString(vocabCountString);
+      System.out.println(retvocabCount);
+
+      // JobClient.runJob is static; no JobClient instance is needed.
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}

Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java
------------------------------------------------------------------------------
    svn:eol-style = native