You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2008/08/19 14:55:48 UTC
svn commit: r687042 [2/4] - in /lucene/mahout/trunk: core/
core/src/main/java/org/apache/mahout/classifier/
core/src/main/java/org/apache/mahout/classifier/bayes/
core/src/main/java/org/apache/mahout/classifier/bayes/common/
core/src/main/java/org/apac...
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ * Can also be used as a local Combiner. A simple summing reducer
+ *
+ **/
+
+public class BayesFeatureReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+ public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+ //Key is label,word, value is the number of times we've seen this label word per local node. Output is the same
+
+ float sum = 0;
+ while (values.hasNext()) {
+ sum += values.next().get();
+ }
+ output.collect(key, new FloatWritable(sum));
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesFeatureReducer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,111 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * The Driver which drives the Tf-Idf Generation
+ *
+ **/
+public class BayesTfIdfDriver {
+  /**
+   * Command-line entry point. Takes in two arguments:
+   * <ol>
+   * <li>The input {@link org.apache.hadoop.fs.Path}. It is handed to
+   * {@link #runJob(String, String)}, which does not read it.</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path}: the directory under which the job looks
+   * for <code>trainer-termDocCount</code>, <code>trainer-wordFreq</code>,
+   * <code>trainer-featureCount</code> and <code>trainer-docCount</code>, and under which it
+   * writes <code>trainer-tfIdf</code>.</li>
+   * </ol>
+   *
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) {
+    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: BayesTfIdfDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configures the Tf-Idf job and submits it through {@link JobClient#runJob(JobConf)}.
+   * Any existing <code>trainer-tfIdf</code> directory under <code>output</code> is deleted
+   * first. The label document counts are read from <code>trainer-docCount/part-*</code> and
+   * passed to the tasks as the <code>cnaivebayes.labelDocumentCounts</code> property.
+   *
+   * @param input the input pathname String (not read by this method)
+   * @param output the output pathname String; interim directories are read from and the
+   *        result is written below it
+   * @throws RuntimeException wrapping any exception raised while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(BayesTfIdfDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
+    Path outPath = new Path(output + "/trainer-tfIdf");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+
+    conf.setMapperClass(BayesTfIdfMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(BayesTfIdfReducer.class);
+    conf.setReducerClass(BayesTfIdfReducer.class);
+    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
+
+    // Do not remove. Presumably JavaSerialization is what lets DefaultStringifier handle the
+    // HashMap of label document counts below (TODO confirm).
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+
+      SequenceFileModelReader reader = new SequenceFileModelReader();
+
+      Path interimFile = new Path(output+"/trainer-docCount/part-*");
+
+      HashMap<String,Float> labelDocumentCounts= reader.readLabelDocumentCounts(dfs, interimFile, conf);
+
+      DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf,GenericsUtil.getClass(labelDocumentCounts));
+
+      String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
+      // Print the map after a serialize/deserialize round trip, i.e. what the tasks will decode.
+      System.out.println("Counts of documents in Each Label");
+      HashMap<String,Float> c = mapStringifier.fromString(labelDocumentCountString);
+      System.out.println(c);
+
+      conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
+
+      client.setConf(conf);
+
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfDriver.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,104 @@
+package org.apache.mahout.classifier.bayes.common;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.GenericsUtil;
+
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ *
+ *
+ */
+public class BayesTfIdfMapper extends MapReduceBase implements
+ Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+ public HashMap<String, Float> labelDocumentCounts = null;
+ String labelDocumentCountString =" ";
+ /**
+ * We need to calculate the Tf-Idf of each feature in each label
+ *
+ * @param key The label,feature pair (can either be the freq Count or the term
+ * Document count
+ * @param value
+ * @param output
+ * @param reporter
+ * @throws IOException
+ */
+ public void map(Text key, FloatWritable value,
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
+
+ String labelFeaturePair = key.toString();
+
+
+ if (labelFeaturePair.startsWith("-")) { // if it is the termDocumentCount
+ labelFeaturePair = labelFeaturePair.substring(1);
+ String label = labelFeaturePair.split(",")[0];
+
+ if(labelDocumentCounts.containsKey(label)==false){
+
+ throw new IOException(label);
+ }
+
+ Float labelDocumentCount = labelDocumentCounts.get(label);
+ float logIdf = (float)Math.log(labelDocumentCount.floatValue() / value.get());
+
+ output.collect(new Text(labelFeaturePair), new FloatWritable(logIdf));
+ }
+ else if (labelFeaturePair.startsWith(",")) {
+ output.collect(new Text("*vocabCount"), new FloatWritable(1.0f));
+ }
+ else {
+ output.collect(key, value);
+ }
+ }
+
+ @Override
+ public void configure(JobConf job) {
+ try
+ {
+ if(labelDocumentCounts ==null){
+ labelDocumentCounts = new HashMap<String, Float>();
+
+ DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(job,GenericsUtil.getClass(labelDocumentCounts));
+
+ labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
+ labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
+
+
+ labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
+ }
+ }
+ catch(IOException ex){
+
+ ex.printStackTrace();
+ }
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfMapper.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,59 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * This class extends MultipleOutputFormat so that the output data can be written to different output files in sequence file format.
+ */
+public class BayesTfIdfOutputFormat extends
+    MultipleOutputFormat<WritableComparable, Writable> {
+
+  /** Delegate that creates the actual record writers; created on first use. */
+  private SequenceFileOutputFormat theSequenceFileOutputFormat = null;
+
+  /**
+   * Returns a record writer obtained from a {@link SequenceFileOutputFormat}.
+   *
+   * @param fs   the file system to write to
+   * @param job  the job configuration
+   * @param name the file name to write
+   * @param arg3 the progress callback handed on to the delegate
+   * @throws IOException if the delegate cannot create the writer
+   */
+  @Override
+  protected RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
+      FileSystem fs, JobConf job, String name, Progressable arg3)
+      throws IOException {
+    SequenceFileOutputFormat delegate = theSequenceFileOutputFormat;
+    if (delegate == null) {
+      delegate = new SequenceFileOutputFormat();
+      theSequenceFileOutputFormat = delegate;
+    }
+    return delegate.getRecordWriter(fs, job, name, arg3);
+  }
+
+  /**
+   * Chooses the output directory from the key: keys starting with "*" go below
+   * <code>trainer-vocabCount</code>, all other keys below <code>trainer-tfIdf</code>.
+   *
+   * @param k    the record key; must be a {@link Text}
+   * @param v    the record value (not inspected)
+   * @param name the default file name
+   * @return the directory-qualified file name
+   */
+  @Override
+  protected String generateFileNameForKeyValue(WritableComparable k, Writable v,
+      String name) {
+    String keyText = ((Text) k).toString();
+    String directory = keyText.startsWith("*") ? "trainer-vocabCount/" : "trainer-tfIdf/";
+    return directory + name;
+  }
+
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfOutputFormat.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,62 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ * Can also be used as a local Combiner because only two values are expected for each key
+ *
+ **/
+
+public class BayesTfIdfReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+  /**
+   * Combines the values of one key. For keys starting with <code>*vocabCount</code> the
+   * values are summed, and the total is also printed to standard output. For every other key
+   * the values are multiplied together; judging by the variable name this is the idf
+   * multiplied by the term weight D_ij (TODO confirm against the mapper inputs).
+   *
+   * @param key      the label,feature pair or the <code>*vocabCount</code> marker
+   * @param values   the values to combine
+   * @param output   receives the single (key, result) pair
+   * @param reporter not used
+   * @throws IOException if the collector fails
+   */
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    String token = key.toString();
+    if (token.startsWith("*vocabCount")) {
+      float vocabCount = 0;
+      while (values.hasNext()) {
+        vocabCount += values.next().get();
+      }
+      System.out.println(token + "\t"+vocabCount);
+      output.collect(key, new FloatWritable(vocabCount));
+    } else {
+      // Start from the multiplicative identity so a single value passes through unchanged.
+      float idfTimes_D_ij = 1.0f;
+      while (values.hasNext()) {
+        idfTimes_D_ij *= values.next().get();
+      }
+      output.collect(key, new FloatWritable(idfTimes_D_ij));
+    }
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesTfIdfReducer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,88 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+
+
+/**
+ * Create and run the weight summing job of the Bayes Trainer.
+ *
+ **/
+public class BayesWeightSummerDriver {
+  /**
+   * Command-line entry point. Takes in two arguments:
+   * <ol>
+   * <li>The input {@link org.apache.hadoop.fs.Path}. It is handed to
+   * {@link #runJob(String, String)}, which does not read it.</li>
+   * <li>The output {@link org.apache.hadoop.fs.Path}: the directory under which the job reads
+   * <code>trainer-tfIdf/trainer-tfIdf</code> and writes <code>trainer-weights</code> as a
+   * {@link org.apache.hadoop.io.SequenceFile}.</li>
+   * </ol>
+   *
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are supplied
+   */
+  public static void main(String[] args) {
+    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: BayesWeightSummerDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configures the weight summing job and submits it through
+   * {@link JobClient#runJob(JobConf)}. Any existing <code>trainer-weights</code> directory
+   * under <code>output</code> is deleted first.
+   *
+   * @param input the input pathname String (not read by this method)
+   * @param output the output pathname String; the Tf-Idf files are read from and the
+   *        weights are written below it
+   * @throws RuntimeException wrapping any exception raised while preparing or running the job
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
+    Path outPath = new Path(output + "/trainer-weights");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+    conf.setMapperClass(BayesWeightSummerMapper.class);
+    // The input is the sequence file output of the Tf-Idf job.
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(BayesWeightSummerReducer.class);
+    conf.setReducerClass(BayesWeightSummerReducer.class);
+    conf.setOutputFormat(BayesWeightSummerOutputFormat.class);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      client.setConf(conf);
+
+      JobClient.runJob(conf);
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerDriver.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,64 @@
+package org.apache.mahout.classifier.bayes.common;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+
+import java.io.IOException;
+
+/**
+ *
+ *
+ */
+public class BayesWeightSummerMapper extends MapReduceBase implements
+    Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+  /**
+   * Emits the weight of one label,feature pair three times so that the reducer can build the
+   * per-feature sum (key <code>,feature</code>), the per-label sum (key <code>_label</code>)
+   * and the grand total (key <code>*</code>).
+   *
+   * @param key The label,feature pair; everything before the first comma is the label
+   * @param value the weight of the pair
+   * @param output receives the three emitted pairs
+   * @param reporter not used
+   * @throws IOException if the key contains no comma, or if the collector fails
+   */
+  public void map(Text key, FloatWritable value,
+      OutputCollector<Text, FloatWritable> output, Reporter reporter)
+      throws IOException {
+
+    String labelFeaturePair = key.toString();
+    int i = labelFeaturePair.indexOf(",");
+    if (i < 0) {
+      // Without this check substring(0, -1) fails with a StringIndexOutOfBoundsException
+      // that does not name the offending key.
+      throw new IOException("Key is not a label,feature pair: " + labelFeaturePair);
+    }
+
+    String label = labelFeaturePair.substring(0, i);
+    String feature = labelFeaturePair.substring(i + 1);
+
+    output.collect(new Text("," + feature), value); // sum of weight for all labels for a feature Sigma_j
+    output.collect(new Text("_" + label), value); // sum of weight for all features for a label Sigma_k
+    output.collect(new Text("*"), value); // sum of weight of all features for all label Sigma_kSigma_j
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerMapper.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,62 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * This class extends MultipleOutputFormat so that the output data can be written to different output files in sequence file format.
+ */
+public class BayesWeightSummerOutputFormat extends
+    MultipleOutputFormat<WritableComparable, Writable> {
+
+  /** Delegate that creates the actual record writers; created on first use. */
+  private SequenceFileOutputFormat theSequenceFileOutputFormat = null;
+
+  /**
+   * Returns a record writer obtained from a {@link SequenceFileOutputFormat}.
+   *
+   * @param fs   the file system to write to
+   * @param job  the job configuration
+   * @param name the file name to write
+   * @param arg3 the progress callback handed on to the delegate
+   * @throws IOException if the delegate cannot create the writer
+   */
+  @Override
+  protected RecordWriter<WritableComparable, Writable> getBaseRecordWriter(
+      FileSystem fs, JobConf job, String name, Progressable arg3)
+      throws IOException {
+    SequenceFileOutputFormat delegate = theSequenceFileOutputFormat;
+    if (delegate == null) {
+      delegate = new SequenceFileOutputFormat();
+      theSequenceFileOutputFormat = delegate;
+    }
+    return delegate.getRecordWriter(fs, job, name, arg3);
+  }
+
+  /**
+   * Chooses the output directory from the first character of the key:
+   * "*" selects <code>Sigma_kSigma_j</code> (sum over all features and labels),
+   * "," selects <code>Sigma_j</code> (sum over all labels for a feature) and
+   * "_" selects <code>Sigma_k</code> (sum over all features for a label).
+   * Any other key yields the fixed name <code>JunkFileThisShouldNotHappen</code>.
+   *
+   * @param k    the record key; must be a {@link Text}
+   * @param v    the record value (not inspected)
+   * @param name the default file name
+   * @return the file name to write the record to
+   */
+  @Override
+  protected String generateFileNameForKeyValue(WritableComparable k, Writable v,
+      String name) {
+    String keyText = ((Text) k).toString();
+    String directory;
+    if (keyText.startsWith("*")) {
+      directory = "Sigma_kSigma_j/";
+    } else if (keyText.startsWith(",")) {
+      directory = "Sigma_j/";
+    } else if (keyText.startsWith("_")) {
+      directory = "Sigma_k/";
+    } else {
+      return "JunkFileThisShouldNotHappen";
+    }
+    return directory + name;
+  }
+
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerOutputFormat.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.classifier.bayes.common;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ * Can also be used as a local Combiner
+ *
+ **/
+
+public class BayesWeightSummerReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+ public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+ //Key is label,word, value is the tfidf of the feature of times we've seen this label word per local node. Output is the same
+
+ float sum = 0;
+ while (values.hasNext()) {
+ sum += values.next().get();
+ }
+ output.collect(key, new FloatWritable(sum));
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/common/BayesWeightSummerReducer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,321 @@
+package org.apache.mahout.classifier.bayes.io;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.common.Model;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * This Class reads the different interim files created during the Training stage as well as the Model File during testing.
+ *
+ */
+public class SequenceFileModelReader {
+
+ public Model loadModel(Model model, FileSystem fs, Map<String, Path> pathPatterns,
+ Configuration conf) throws IOException {
+
+ loadFeatureWeights(model, fs, pathPatterns.get("sigma_j"), conf);
+ loadLabelWeights(model, fs, pathPatterns.get("sigma_k"), conf);
+ loadSumWeight(model, fs, pathPatterns.get("sigma_kSigma_j"), conf);
+ loadThetaNormalizer(model, fs, pathPatterns.get("thetaNormalizer"), conf);
+
+
+ model.initializeWeightMatrix();
+
+ loadWeightMatrix(model, fs, pathPatterns.get("weight"), conf);
+ model.InitializeNormalizer();
+ //model.GenerateComplementaryModel();
+ return model;
+ }
+
+ public Model loadWeightMatrix(Model model, FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
+
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+
+ int idx = keyStr.indexOf(",");
+ if (idx != -1) {
+ model.loadFeatureWeight(keyStr.substring(0, idx), keyStr.substring(idx + 1), value.get());
+ }
+
+ }
+ }
+
+ return model;
+ }
+
+ public Model loadFeatureWeights(Model model, FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+
+ if (keyStr.startsWith(",")) { // Sum of weights for a Feature
+ model.setSumFeatureWeight(keyStr.substring(1),
+ value.get());
+ }
+ }
+ }
+ return model;
+ }
+
+ public Model loadLabelWeights(Model model,FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+
+ if (keyStr.startsWith("_")) { // Sum of weights in a Label
+ model.setSumLabelWeight(keyStr.substring(1), value
+ .get());
+ }
+ }
+ }
+
+ return model;
+ }
+
+ public Model loadThetaNormalizer(Model model,FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+ if (keyStr.startsWith("_")) { // Sum of weights in a Label
+ model.setThetaNormalizer(keyStr.substring(1), value
+ .get());
+ }
+ }
+ }
+
+ return model;
+ }
+
+ public Model loadSumWeight(Model model, FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+
+ if (keyStr.startsWith("*")) { // Sum of weights for all Feature
+ // and all Labels
+ model.setSigma_jSigma_k(value.get());
+ System.out.println(value.get());
+ }
+ }
+ }
+ return model;
+ }
+
+ public void createMapFile(FileSystem fs, Path pathPattern, Configuration conf)
+ throws IOException {
+
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+ MapFile.Writer writer = new MapFile.Writer(conf, fs, "data.mapfile",
+ Text.class, FloatWritable.class);
+ MapFile.Writer.setIndexInterval(conf, 3);
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ System.out.println(path.toString());
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+
+ if (keyStr.startsWith("_")) {
+
+ } else if (keyStr.startsWith(",")) {
+
+ } else if (keyStr.startsWith("*")) {
+
+ } else {
+ int idx = keyStr.indexOf(",");
+ if (idx != -1) {
+ HashMap<String, Float> data = new HashMap<String, Float>();
+ data.put(keyStr.substring(0, idx), new Float(value.get()));
+ writer.append(new Text(key.toString()), value);
+ }
+ }
+ }
+ }
+ writer.close();
+ // return model;
+ }
+
+ public HashMap<String, Float> readLabelSums(FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+ HashMap<String, Float> labelSum = new HashMap<String, Float>();
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+ if (keyStr.startsWith("_")) { // Sum of weights of labels
+ labelSum.put(keyStr.substring(1), new Float(value.get()));
+ }
+
+ }
+ }
+
+ return labelSum;
+ }
+
+ public HashMap<String, Float> readLabelDocumentCounts(FileSystem fs,
+ Path pathPattern, Configuration conf) throws IOException {
+ HashMap<String, Float> labelDocumentCounts = new HashMap<String, Float>();
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ // the key is either _label_ or label,feature
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+ if (keyStr.startsWith("_")) { // Count of Documents in a Label
+ labelDocumentCounts.put(keyStr.substring(1), new Float(value.get()));
+ }
+
+ }
+ }
+
+ return labelDocumentCounts;
+ }
+
+ public Float readSigma_jSigma_k(FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+ HashMap<String, Float> weightSum = new HashMap<String, Float>();
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ // the key is *
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+ if (weightSum.size() > 1) {
+ throw new IOException("Incorrect Sum File");
+ } else if (keyStr.startsWith("*")) {
+ weightSum.put(keyStr, new Float(value.get()));
+ }
+
+ }
+ }
+
+ Float sigma_jSigma_k = weightSum.get("*");
+ return sigma_jSigma_k;
+ }
+
+ public Float readVocabCount(FileSystem fs, Path pathPattern,
+ Configuration conf) throws IOException {
+ HashMap<String, Float> weightSum = new HashMap<String, Float>();
+ Writable key = new Text();
+ FloatWritable value = new FloatWritable();
+
+ FileStatus[] outputFiles = fs.globStatus(pathPattern);
+ for (FileStatus fileStatus : outputFiles) {
+ Path path = fileStatus.getPath();
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
+ // the key is *
+ while (reader.next(key, value)) {
+ String keyStr = key.toString();
+ if (weightSum.size() > 1) {
+ throw new IOException("Incorrect vocabCount File");
+ }
+ if (keyStr.startsWith("*")) {
+ weightSum.put(keyStr, new Float(value.get()));
+ }
+
+ }
+ }
+
+ Float sigma_jSigma_k = weightSum.get("*vocabCount");
+ return sigma_jSigma_k;
+ }
+
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/io/SequenceFileModelReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html Tue Aug 19 05:55:45 2008
@@ -0,0 +1,89 @@
+<!--
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+-->
+<HTML>
+<HEAD>
+ <TITLE>org.apache.mahout.classifier.bayes</TITLE>
+</HEAD>
+<BODY>
+<DIV><h2>Introduction</h2>
+ The bayes package provides an implementation of a Map Reduce enabled naive bayes classifier. The naive bayes
+ classifier is a very simple classifier that counts the occurrences of words in association with a label which
+ can then be used to determine the likelihood that a new document, and its words, should be assigned a particular
+ label.
+</DIV>
+<div><h2>Implementation</h2>
+
+ <p>The implementation is divided up into three parts:
+ <ol>
+ <li>The Trainer -- responsible for doing the counting of the words and the labels</li>
+ <li>The Model -- responsible for holding the training data in a useful way</li>
+ <li>The Classifier -- responsible for using the trainer's output to determine the category of previously unseen
+ documents
+ </li>
+ </ol>
+ </p>
+ <div><h3>The Trainer</h3>
+
+ <p>The trainer is manifested in several classes:
+ <ol>
+ <li>{@link org.apache.mahout.classifier.bayes.BayesDriver} -- Creates the Hadoop Naive Bayes job and outputs
+ the model. This Driver encapsulates a lot of intermediate Map-Reduce Classes
+ </li>
+ <li>{@link org.apache.mahout.classifier.bayes.common.BayesFeatureDriver}
+ </li>
+ <li>{@link org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver}
+ </li>
+ <li>{@link org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver}
+ </li>
+ <li>{@link org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver}
+ </li>
+ </ol>
+ The trainer assumes that the input files are in the {@link org.apache.hadoop.mapred.KeyValueTextInputFormat}, i.e.
+ the first token of the line
+ is the label and separated from the remaining tokens on the line by a tab-delimiter. The remaining tokens are the unique features (words). Thus, input documents might look
+ like:
+ <pre>
+ hockey puck stick goalie forward defenseman referee ice checking slapshot helmet
+ football field football pigskin referee helmet turf tackle
+ </pre>
+ where hockey and football are the labels and the remaining words are the features associated with those particular
+ labels.</p>
+ <p>The output from the trainer is a {@link org.apache.hadoop.io.SequenceFile}.</p>
+ </div>
+ <div><h3>The Model</h3>
+ <p>The {@link org.apache.mahout.classifier.bayes.BayesModel} is the data structure used to represent the results of the training
+ for use by the {@link org.apache.mahout.classifier.bayes.BayesClassifier}. A Model can be created by hand, or, if using
+ the {@link org.apache.mahout.classifier.bayes.BayesDriver}, it can be created from the {@link
+ org.apache.hadoop.io.SequenceFile} that is output. To create it from the SequenceFile, use the
+ {@link org.apache.mahout.classifier.bayes.io.SequenceFileModelReader} located in the io subpackage.</p>
+ </div>
+ <div><h3>The Classifier</h3>
+
+ <p>The {@link org.apache.mahout.classifier.bayes.BayesClassifier} is responsible for using a {@link
+ org.apache.mahout.classifier.bayes.BayesModel} to classify
+ documents into categories.</p>
+ </div>
+</div>
+
+<DIV> </DIV>
+<DIV align="center">
+ Copyright © 2008 <A HREF="http://www.apache.org">Apache Software Foundation</A>
+</DIV>
+</BODY>
+</HTML>
\ No newline at end of file
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,134 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.util.PriorityQueue;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.common.Classifier;
+import org.apache.mahout.common.Model;
+
+import java.util.Collection;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.LinkedList;
+
+/**
+ * Classifies documents based on a {@link CBayesModel}.
+ */
+public class CBayesClassifier implements Classifier{
+
+  /**
+   * Classify the document and return the top <code>numResults</code>
+   *
+   * <p>Only labels whose score is strictly negative are considered. When no label
+   * qualifies, the returned collection holds a single result for
+   * <code>defaultCategory</code> with a score of 0.</p>
+   *
+   * @param model           The model
+   * @param document        The document to classify
+   * @param defaultCategory The default category to assign
+   * @param numResults      The maximum number of results to return, ranked by score. Ties are broken by comparing the category
+   * @return A Collection of {@link org.apache.mahout.classifier.ClassifierResult}s, in the order they are popped from the queue.
+   */
+  public Collection<ClassifierResult> classify(Model model, String[] document, String defaultCategory, int numResults) {
+    Collection<String> labels = model.getLabels();
+    PriorityQueue queue = new ClassifierResultPriorityQueue(numResults);
+    for (String label : labels) {
+      float score = documentProbability(model, label, document);
+      if (score < 0) {
+        queue.insert(new ClassifierResult(label, score));
+      }
+    }
+
+    // Drain the queue; pop() yields null once it is empty.
+    LinkedList<ClassifierResult> ranked = new LinkedList<ClassifierResult>();
+    for (Object next = queue.pop(); next != null; next = queue.pop()) {
+      ranked.addLast((ClassifierResult) next);
+    }
+    if (ranked.isEmpty()) {
+      ranked.add(new ClassifierResult(defaultCategory, 0));
+    }
+    return ranked;
+  }
+
+  /**
+   * Classify the document according to the {@link org.apache.mahout.common.Model}
+   *
+   * <p>The label with the lowest score below zero wins. If no label scores below
+   * zero, the result keeps <code>defaultCategory</code> and a score of 0.</p>
+   *
+   * @param model           The trained {@link org.apache.mahout.common.Model}
+   * @param document        The document to classify
+   * @param defaultCategory The default category to assign if one cannot be determined
+   * @return The single best category
+   */
+  public ClassifierResult classify(Model model, String[] document, String defaultCategory) {
+    ClassifierResult best = new ClassifierResult(defaultCategory);
+    float lowest = 0.0f;
+    Collection<String> labels = model.getLabels();
+
+    for (String label : labels) {
+      float score = documentProbability(model, label, document);
+      if (score < lowest) {
+        lowest = score;
+        best.setLabel(label);
+      }
+    }
+    best.setScore(lowest);
+    return best;
+  }
+
+  /**
+   * Calculate the document score as the sum, over the distinct words of the document,
+   * of the word's occurrence count times its
+   * {@link org.apache.mahout.common.Model#FeatureWeight(String, String)} given the label
+   *
+   * @param model    The {@link org.apache.mahout.common.Model}
+   * @param label    The label to calculate the score of
+   * @param document The document
+   * @return The score
+   * @see Model#FeatureWeight(String, String)
+   */
+  public float documentProbability(Model model, String label, String[] document) {
+    // Count how often each word occurs in the document.
+    Hashtable<String, Integer> counts = new Hashtable<String, Integer>(1000);
+    for (String word : document) {
+      Integer seen = counts.get(word);
+      counts.put(word, seen == null ? 1 : seen + 1);
+    }
+
+    float total = 0.0f;
+    Enumeration<String> words = counts.keys();
+    while (words.hasMoreElements()) {
+      String word = words.nextElement();
+      Integer occurrences = counts.get(word);
+      total += occurrences * model.FeatureWeight(label, word);
+    }
+    return total;
+  }
+
+
+  /**
+   * Queue of {@link ClassifierResult}s ordered by score, with equal scores ordered
+   * by label.
+   */
+  private static class ClassifierResultPriorityQueue extends PriorityQueue {
+
+    private ClassifierResultPriorityQueue(int numResults) {
+      initialize(numResults);
+    }
+
+    protected boolean lessThan(Object a, Object b) {
+      ClassifierResult left = (ClassifierResult) a;
+      ClassifierResult right = (ClassifierResult) b;
+
+      float leftScore = left.getScore();
+      float rightScore = right.getScore();
+      if (leftScore == rightScore) {
+        return left.getLabel().compareTo(right.getLabel()) < 0;
+      }
+      return leftScore < rightScore;
+    }
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesClassifier.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,126 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.common.BayesFeatureDriver;
+import org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver;
+import org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver;
+
+/**
+ * Create and run the Bayes Trainer.
+ *
+ */
+public class CBayesDriver {
+ /**
+ * Takes in two arguments:
+ * <ol>
+ * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
+ * live</li>
+ * <li>The output {@link org.apache.hadoop.fs.Path} where to write the
+ * {@link org.apache.mahout.common.Model} as a
+ * {@link org.apache.hadoop.io.SequenceFile}</li>
+ * </ol>
+ *
+ * @param args The args
+ */
+ public static void main(String[] args) {
+ String input = args[0];
+ String output = args[1];
+
+ runJob(input, output, 1);
+ }
+
+ /**
+ * Run the job
+ *
+ * @param input the input pathname String
+ * @param output the output pathname String
+ *
+ */
+ @SuppressWarnings("deprecation")
+ public static void runJob(String input, String output, int gramSize) {
+ JobConf conf = new JobConf(CBayesDriver.class);
+ try {
+ FileSystem dfs = FileSystem.get(conf);
+ Path outPath = new Path(output);
+ if (dfs.exists(outPath))
+ dfs.delete(outPath);
+
+ System.out.println("Reading features...");
+ //Read the features in each document normalized by length of each document
+ BayesFeatureDriver.runJob(input, output, gramSize);
+
+ System.out.println("Calculating Tf-Idf...");
+ //Calculate the TfIdf for each word in each label
+ BayesTfIdfDriver.runJob(input, output);
+
+ System.out.println("Calculating weight sums for labels and features...");
+ //Calculate the Sums of weights for each label, for each feature and for each feature and for each label
+ BayesWeightSummerDriver.runJob(input, output);
+
+ //System.out.println("Calculating the weight of the features of each label in the complement class...");
+ //Calculate the W_ij = log(Theta) for each label, feature. This step actually generates the complement class
+ //CBayesThetaDriver.runJob(input, output);
+
+ System.out.println("Calculating the weight Normalisation factor for each complement class...");
+ //Calculate the normalization factor Sigma_W_ij for each complement class.
+ CBayesThetaNormalizerDriver.runJob(input, output);
+
+ //System.out.println("Calculating the final Weight Normalized Complementary Naive Bayes Model...");
+ //Calculate the normalization factor Sigma_W_ij for each complement class.
+ //CBayesNormalizedWeightDriver.runJob(input, output);
+
+ Path docCountOutPath = new Path(output+ "/trainer-docCount");
+ if (dfs.exists(docCountOutPath))
+ dfs.delete(docCountOutPath, true);
+ Path termDocCountOutPath = new Path(output+ "/trainer-termDocCount");
+ if (dfs.exists(termDocCountOutPath))
+ dfs.delete(termDocCountOutPath, true);
+ Path featureCountOutPath = new Path(output+ "/trainer-featureCount");
+ if (dfs.exists(featureCountOutPath))
+ dfs.delete(featureCountOutPath, true);
+ Path wordFreqOutPath = new Path(output+ "/trainer-wordFreq");
+ if (dfs.exists(wordFreqOutPath))
+ dfs.delete(wordFreqOutPath, true);
+ Path vocabCountPath = new Path(output+ "/trainer-tfIdf/trainer-vocabCount");
+ if (dfs.exists(vocabCountPath))
+ dfs.delete(vocabCountPath, true);
+ /*Path tfIdfOutPath = new Path(output+ "/trainer-tfIdf");
+ if (dfs.exists(tfIdfOutPath))
+ dfs.delete(tfIdfOutPath, true);*/
+ Path vocabCountOutPath = new Path(output+ "/trainer-vocabCount");
+ if (dfs.exists(vocabCountOutPath))
+ dfs.delete(vocabCountOutPath, true);
+ /* Path weightsOutPath = new Path(output+ "/trainer-weights");
+ if (dfs.exists(weightsOutPath))
+ dfs.delete(weightsOutPath, true);*/
+ /*Path thetaOutPath = new Path(output+ "/trainer-theta");
+ if (dfs.exists(thetaOutPath))
+ dfs.delete(thetaOutPath, true);*/
+ /*Path thetaNormalizerOutPath = new Path(output+ "/trainer-thetaNormalizer");
+ if (dfs.exists(thetaNormalizerOutPath))
+ dfs.delete(thetaNormalizerOutPath, true);*/
+
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesDriver.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,194 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.mahout.common.Model;
+
+import java.util.Map;
+
+
+/**
+ * {@link Model} used by the CBayes classifier. The weight of a (label, feature)
+ * pair is derived from the feature's summed weight minus the weight stored for that
+ * label, relative to the overall weight sum minus the label's summed weight.
+ */
+public class CBayesModel extends Model {
+
+  /**
+   * Computes the normalized weight of a feature for a label as
+   * <code>-log((sigma_j - w + alpha_i) / (sigma_jSigma_k - sumLabelWeight + vocabCount))
+   * / thetaNormalizer(label)</code>, where <code>w</code> is the weight stored for the
+   * pair (0 when none is stored) and <code>vocabCount</code> is the number of features.
+   *
+   * @param label   the label index
+   * @param feature the feature index
+   * @return the normalized weight
+   */
+  @Override
+  protected float getWeight(Integer label, Integer feature) {
+    Map<Integer, Float> weightsByLabel = featureLabelWeights.get(feature);
+
+    float stored = 0.0f;
+    if (weightsByLabel.containsKey(label)) {
+      stored = weightsByLabel.get(label).floatValue();
+    }
+    float vocabCount = featureList.size();
+    float sumLabelWeight = getSumLabelWeight(label);
+    float sigma_j = getSumFeatureWeight(feature);
+
+    float numerator = sigma_j - stored + alpha_i;
+    float denominator = (sigma_jSigma_k - sumLabelWeight + vocabCount);
+
+    float logRatio = (float) Math.log(numerator / denominator);
+    return (-1.0f * (logRatio / getThetaNormalizer(label)));
+  }
+
+  /**
+   * Returns the weight stored for the pair as is.
+   *
+   * @param label   the label index
+   * @param feature the feature index
+   * @return the stored weight, or 0 when the feature has no entry for the label
+   */
+  @Override
+  protected float getWeightUnprocessed(Integer label, Integer feature) {
+    Map<Integer, Float> weightsByLabel = featureLabelWeights.get(feature);
+
+    if (weightsByLabel.containsKey(label)) {
+      return weightsByLabel.get(label).floatValue();
+    }
+    return 0;
+  }
+
+  /**
+   * Divides every theta normalizer by the smallest absolute value found among them.
+   * The normalizer map is printed to standard output before and after the rescaling.
+   */
+  @Override
+  public void InitializeNormalizer() {
+    float smallestMagnitude = Float.MAX_VALUE;
+
+    System.out.println(thetaNormalizer);
+    // First pass: find the smallest absolute normalizer.
+    for (Integer label : thetaNormalizer.keySet()) {
+      float labelSum = thetaNormalizer.get(label);
+      if (smallestMagnitude > Math.abs(labelSum)) {
+        smallestMagnitude = Math.abs(labelSum);
+      }
+    }
+
+    // Second pass: rescale each normalizer by it.
+    for (Integer label : thetaNormalizer.keySet()) {
+      float labelSum = thetaNormalizer.get(label);
+      thetaNormalizer.put(label, labelSum / smallestMagnitude);
+    }
+    System.out.println(thetaNormalizer);
+  }
+
+  /**
+   * Rewrites the stored weights in three phases: compute the log weight of every
+   * (label, feature) pair and accumulate it per label, storing it only for pairs
+   * with a non-zero stored weight; rescale the per-label sums by the smallest
+   * absolute sum; and finally replace every non-zero stored weight by its negated
+   * ratio to the label's rescaled sum. Any exception is caught and its stack trace
+   * printed.
+   */
+  @Override
+  public void GenerateModel() {
+    try {
+      float vocabCount = featureList.size();
+
+      float[] labelSums = new float[labelList.size()];
+
+      float smallestMagnitude = Float.MAX_VALUE;
+
+      for (int feature = 0, maxFeatures = featureList.size(); feature < maxFeatures; feature++) {
+        for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+
+          float stored = getWeightUnprocessed(label, feature);
+          float sumLabelWeight = getSumLabelWeight(label);
+          float sigma_j = getSumFeatureWeight(feature);
+
+          float numerator = (sigma_j - stored) + alpha_i;
+          float denominator = (sigma_jSigma_k - sumLabelWeight) + vocabCount;
+
+          float logWeight = (float) Math.log(numerator / denominator);
+
+          // Only pairs that already carry a weight are overwritten.
+          if (stored != 0) {
+            setWeight(label, feature, logWeight);
+          }
+
+          labelSums[label] += logWeight;
+
+        }
+      }
+      System.out.println("Normalizing Weights");
+      for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+        float labelSum = labelSums[label];
+        if (smallestMagnitude > Math.abs(labelSum)) {
+          smallestMagnitude = Math.abs(labelSum);
+        }
+      }
+
+      for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+        float labelSum = labelSums[label];
+        labelSums[label] = labelSum / smallestMagnitude;
+      }
+
+      for (int feature = 0, maxFeatures = featureList.size(); feature < maxFeatures; feature++) {
+        for (int label = 0, maxLabels = labelList.size(); label < maxLabels; label++) {
+          float stored = getWeightUnprocessed(label, feature);
+          if (stored == 0) {
+            continue;
+          }
+          float labelSum = labelSums[label];
+          float normalizedWeight = -1.0f * (stored / labelSum);
+          setWeight(label, feature, normalizedWeight);
+        }
+      }
+    } catch (Exception e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+    }
+  }
+
+
+  /**
+   * Get the weighted probability of the feature.
+   *
+   * @param label   The label of the feature
+   * @param feature The feature to calc. the prob. for
+   * @return The weighted probability, as computed by {@link #getWeight(Integer, Integer)}
+   */
+  @Override
+  public float FeatureWeight(Integer label, Integer feature) {
+    return getWeight(label, feature);
+  }
+
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesModel.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,122 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * Configures and runs the map/reduce job that turns the theta values found
+ * under {@code <output>/trainer-theta} into normalized weights written to
+ * {@code <output>/trainer-weight}.
+ *
+ **/
+public class CBayesNormalizedWeightDriver {
+  /**
+   * Command-line entry point. Takes in two arguments:
+   * <ol>
+   * <li>The input path (passed through to {@link #runJob(String, String)}, which currently does not read it)</li>
+   * <li>The output base path; the job reads {@code trainer-theta} and {@code trainer-thetaNormalizer}
+   * below it and writes {@code trainer-weight} below it</li>
+   * </ol>
+   *
+   * @param args The args
+   * @throws IllegalArgumentException if fewer than two arguments are given
+   */
+  public static void main(String[] args) {
+    // Fail with a usage message rather than an ArrayIndexOutOfBoundsException.
+    if (args == null || args.length < 2) {
+      throw new IllegalArgumentException("Usage: CBayesNormalizedWeightDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Run the job.
+   * <p>
+   * Before submitting, the per-label theta normalizers are read from
+   * {@code <output>/trainer-thetaNormalizer/part*}, rescaled so that the one with
+   * the smallest magnitude becomes +/-1, and stored serialized in the job
+   * configuration under {@code cnaivebayes.thetaNormalizations}.
+   * Any existing {@code <output>/trainer-weight} directory is deleted first.
+   *
+   * @param input  the input pathname String (not used by this job)
+   * @param output the output pathname String
+   * @throws RuntimeException if reading the normalizers or running the job fails
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);
+
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(FloatWritable.class);
+    SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
+    Path outPath = new Path(output + "/trainer-weight");
+    SequenceFileOutputFormat.setOutputPath(conf, outPath);
+    conf.setNumMapTasks(100);
+    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
+    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
+    conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+    // JavaSerialization has to be registered: the HashMap handed to DefaultStringifier below
+    // is not a Writable, so WritableSerialization alone cannot (de)serialize it.
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+
+      SequenceFileModelReader reader = new SequenceFileModelReader();
+
+      Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
+      HashMap<String, Float> thetaNormalizer = reader.readLabelSums(dfs, thetaNormalizationsFiles, conf);
+      rescaleBySmallestMagnitude(thetaNormalizer);
+
+      DefaultStringifier<HashMap<String, Float>> mapStringifier =
+          new DefaultStringifier<HashMap<String, Float>>(conf, GenericsUtil.getClass(thetaNormalizer));
+      String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
+
+      // Print the round-tripped map so the value the mappers will see is visible in the driver output.
+      HashMap<String, Float> roundTripped = mapStringifier.fromString(thetaNormalizationsString);
+      System.out.println(roundTripped);
+      conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);
+
+      // runJob is static; no JobClient instance is required.
+      JobClient.runJob(conf);
+
+    } catch (RuntimeException e) {
+      // Already unchecked: propagate as-is instead of wrapping it a second time.
+      throw e;
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  /**
+   * Divides every value in the map by the smallest absolute value found in it.
+   * An empty map is left untouched.
+   *
+   * @param thetaNormalizer per-label normalizers, rescaled in place
+   * @throws IllegalStateException if the smallest magnitude is 0, since dividing by it
+   *         would fill the map with Infinity/NaN
+   */
+  private static void rescaleBySmallestMagnitude(Map<String, Float> thetaNormalizer) {
+    float smallestMagnitude = Float.MAX_VALUE;
+    for (Float sigma : thetaNormalizer.values()) {
+      float magnitude = Math.abs(sigma);
+      if (smallestMagnitude > magnitude) {
+        smallestMagnitude = magnitude;
+      }
+    }
+
+    if (smallestMagnitude == 0.0f) {
+      throw new IllegalStateException(
+          "A theta normalizer of 0 was read; cannot rescale the normalizers: " + thetaNormalizer);
+    }
+
+    // setValue replaces the value of an existing key, so iterating the entry set stays valid.
+    for (Map.Entry<String, Float> entry : thetaNormalizer.entrySet()) {
+      entry.setValue(entry.getValue() / smallestMagnitude);
+    }
+  }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightDriver.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,85 @@
+package org.apache.mahout.classifier.cbayes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.GenericsUtil;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ *
+ *
+ */
+public class CBayesNormalizedWeightMapper extends MapReduceBase implements
+ Mapper<Text, FloatWritable, Text, FloatWritable> {
+
+ public HashMap<String, Float> thetaNormalizer = null;
+
+ String thetaNormalizationsString = " ";
+
+ /**
+ * We need to calculate the idf of each feature in each label
+ *
+ * @param key The label,feature pair (can either be the freq Count or the term
+ * Document count
+ * @param value
+ * @param output
+ * @param reporter
+ * @throws IOException
+ */
+ public void map(Text key, FloatWritable value,
+ OutputCollector<Text, FloatWritable> output, Reporter reporter)
+ throws IOException {
+
+ String labelFeaturePair = key.toString();
+
+ String label = labelFeaturePair.split(",")[0];
+ output.collect(key, new FloatWritable((float)(-1.0f * (float)Math.log(value.get())/thetaNormalizer.get(label))));// output -D_ij
+
+ }
+
+ @Override
+ public void configure(JobConf job) {
+ try {
+ if (thetaNormalizer == null) {
+ thetaNormalizer = new HashMap<String, Float>();
+
+ DefaultStringifier<HashMap<String, Float>> mapStringifier = new DefaultStringifier<HashMap<String, Float>>(
+ job, GenericsUtil.getClass(thetaNormalizer));
+
+ thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
+ thetaNormalizationsString = job.get("cnaivebayes.thetaNormalizations",
+ thetaNormalizationsString);
+ thetaNormalizer = mapStringifier.fromString(thetaNormalizationsString);
+
+ }
+ } catch (IOException ex) {
+
+ ex.printStackTrace();
+ }
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightMapper.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,53 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+
+/**
+ * Adds up all values received for a key and emits the key with that total.
+ * Because it only sums, the same class is also registered as the combiner by
+ * {@link CBayesNormalizedWeightDriver}.
+ *
+ **/
+
+public class CBayesNormalizedWeightReducer extends MapReduceBase implements Reducer<Text, FloatWritable, Text, FloatWritable> {
+
+  /**
+   * Sums the values of one key.
+   *
+   * @param key      the label,feature pair
+   * @param values   the weights collected for that pair
+   * @param output   collector receiving the key with the summed weight
+   * @param reporter not used
+   * @throws IOException if collecting the output fails
+   */
+  public void reduce(Text key, Iterator<FloatWritable> values, OutputCollector<Text, FloatWritable> output, Reporter reporter) throws IOException {
+    float weight = 0.0f;
+    while (values.hasNext()) {
+      weight += values.next().get();
+    }
+    output.collect(key, new FloatWritable(weight));
+  }
+
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesNormalizedWeightReducer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java?rev=687042&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java Tue Aug 19 05:55:45 2008
@@ -0,0 +1,127 @@
+package org.apache.mahout.classifier.cbayes;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.GenericsUtil;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.mahout.classifier.bayes.io.SequenceFileModelReader;
+
+
+/**
+ * Create and run the Bayes Trainer.
+ *
+ **/
+public class CBayesThetaDriver {
+ /**
+ * Takes in two arguments:
+ * <ol>
+ * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+ * <li>The output {@link org.apache.hadoop.fs.Path} where to write the {@link org.apache.mahout.common.Model} as a {@link org.apache.hadoop.io.SequenceFile}</li>
+ * </ol>
+ * @param args The args
+ */
+ public static void main(String[] args) {
+ String input = args[0];
+ String output = args[1];
+
+ runJob(input, output);
+ }
+
+ /**
+ * Run the job
+ *
+ * @param input the input pathname String
+ * @param output the output pathname String
+
+ */
+ public static void runJob(String input, String output) {
+ JobClient client = new JobClient();
+ JobConf conf = new JobConf(CBayesThetaDriver.class);
+
+
+ conf.setOutputKeyClass(Text.class);
+ conf.setOutputValueClass(FloatWritable.class);
+
+ SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-weights/Sigma_j"));
+ SequenceFileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
+ Path outPath = new Path(output + "/trainer-theta");
+ SequenceFileOutputFormat.setOutputPath(conf, outPath);
+ //conf.setNumMapTasks(1);
+ //conf.setNumReduceTasks(1);
+ conf.setMapperClass(CBayesThetaMapper.class);
+ conf.setInputFormat(SequenceFileInputFormat.class);
+ //conf.setCombinerClass(CBayesThetaReducer.class);
+ conf.setReducerClass(CBayesThetaReducer.class);
+ conf.setOutputFormat(SequenceFileOutputFormat.class);
+
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization"); // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
+ try {
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.exists(outPath))
+ dfs.delete(outPath, true);
+
+ SequenceFileModelReader reader = new SequenceFileModelReader();
+
+ Path Sigma_kFiles = new Path(output+"/trainer-weights/Sigma_k/*");
+ HashMap<String,Float> labelWeightSum= reader.readLabelSums(dfs, Sigma_kFiles, conf);
+ DefaultStringifier<HashMap<String,Float>> mapStringifier = new DefaultStringifier<HashMap<String,Float>>(conf, GenericsUtil.getClass(labelWeightSum));
+ String labelWeightSumString = mapStringifier.toString(labelWeightSum);
+
+ System.out.println("Sigma_k for Each Label");
+ HashMap<String,Float> c = mapStringifier.fromString(labelWeightSumString);
+ System.out.println(c);
+ conf.set("cnaivebayes.sigma_k", labelWeightSumString);
+
+
+ Path sigma_kSigma_jFile = new Path(output+"/trainer-weights/Sigma_kSigma_j/*");
+ Float sigma_jSigma_k = reader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
+ DefaultStringifier<Float> floatStringifier = new DefaultStringifier<Float>(conf, Float.class);
+ String sigma_jSigma_kString = floatStringifier.toString(sigma_jSigma_k);
+
+ System.out.println("Sigma_kSigma_j for each Label and for each Features");
+ Float retSigma_jSigma_k = floatStringifier.fromString(sigma_jSigma_kString);
+ System.out.println(retSigma_jSigma_k);
+ conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);
+
+ Path vocabCountFile = new Path(output+"/trainer-tfIdf/trainer-vocabCount/*");
+ Float vocabCount = reader.readVocabCount(dfs, vocabCountFile, conf);
+ String vocabCountString = floatStringifier.toString(vocabCount);
+
+ System.out.println("Vocabulary Count");
+ conf.set("cnaivebayes.vocabCount", vocabCountString);
+ Float retvocabCount = floatStringifier.fromString(vocabCountString);
+ System.out.println(retvocabCount);
+
+ client.setConf(conf);
+
+ JobClient.runJob(conf);
+
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+}
Propchange: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/cbayes/CBayesThetaDriver.java
------------------------------------------------------------------------------
svn:eol-style = native