You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/15 19:27:32 UTC
svn commit: r910285 - in /lucene/mahout/trunk/core/src:
main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/
test/java/org/apache/mahout/classifier/
test/java/org/apache/mahout/classifier/bayes/
Author: robinanil
Date: Mon Feb 15 18:27:31 2010
New Revision: 910285
URL: http://svn.apache.org/viewvc?rev=910285&view=rev
Log:
MAHOUT-293 Classifier TestData and BayesClassifier Self Test
Added:
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java?rev=910285&r1=910284&r2=910285&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java Mon Feb 15 18:27:31 2010
@@ -45,7 +45,7 @@
private static final Logger log = LoggerFactory.getLogger(BayesClassifierDriver.class);
- private BayesClassifierDriver() { }
+ private BayesClassifierDriver() {}
/**
* Run the job
@@ -87,10 +87,10 @@
log.info("{}", matrix.summarize());
}
- private static ConfusionMatrix readResult(FileSystem fs,
- Path pathPattern,
- Configuration conf,
- Parameters params) throws IOException {
+ public static ConfusionMatrix readResult(FileSystem fs,
+ Path pathPattern,
+ Configuration conf,
+ Parameters params) throws IOException {
StringTuple key = new StringTuple();
DoubleWritable value = new DoubleWritable();
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java?rev=910285&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java Mon Feb 15 18:27:31 2010
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Test data for classifier tests: sample document snippets taken from the ASF web
+ * pages of the Mahout, Lucene and SpamAssassin projects, each paired with its
+ * category label, plus helpers to write the data in the formats the trainers read.
+ */
+public final class ClassifierData {
+
+ // Each entry is a {label, document-text} pair. Three labels ("mahout", "lucene",
+ // "spamassasin" [sic — label kept as-is, tests compare against it literally]),
+ // four documents per label, twelve examples in total.
+ public static final String[][] DATA = {
+ {
+ "mahout",
+ "Mahout's goal is to build scalable machine learning libraries. With scalable we mean: "
+ + "Scalable to reasonably large data sets. Our core algorithms for clustering,"
+ + " classfication and batch based collaborative filtering are implemented on top "
+ + "of Apache Hadoop using the map/reduce paradigm. However we do not restrict "
+ + "contributions to Hadoop based implementations: Contributions that run on"},
+ {
+ "mahout",
+ " a single node or on a non-Hadoop cluster are welcome as well. The core"
+ + " libraries are highly optimized to allow for good performance also for"
+ + " non-distributed algorithms. Scalable to support your business case. "
+ + "Mahout is distributed under a commercially friendly Apache Software license. "
+ + "Scalable community. The goal of Mahout is to build a vibrant, responsive, "},
+ {
+ "mahout",
+ "diverse community to facilitate discussions not only on the project itself"
+ + " but also on potential use cases. Come to the mailing lists to find out more."
+ + " Currently Mahout supports mainly four use cases: Recommendation mining takes "
+ + "users' behavior and from that tries to find items users might like. Clustering "},
+ {
+ "mahout",
+ "takes e.g. text documents and groups them into groups of topically related documents."
+ + " Classification learns from exisiting categorized documents what documents of"
+ + " a specific category look like and is able to assign unlabelled documents to "
+ + "the (hopefully) correct category. Frequent itemset mining takes a set of item"
+ + " groups (terms in a query session, shopping cart content) and identifies, which"
+ + " individual items usually appear together."},
+ {
+ "lucene",
+ "Apache Lucene is a high-performance, full-featured text search engine library"
+ + " written entirely in Java. It is a technology suitable for nearly any application "
+ + "that requires full-text search, especially cross-platform. Apache Lucene is an open source"
+ + " project available for free download. Please use the links on the left to access Lucene. "
+ + "The new version is mostly a cleanup release without any new features. "},
+ {
+ "lucene",
+ "All deprecations targeted to be removed in version 3.0 were removed. If you "
+ + "are upgrading from version 2.9.1 of Lucene, you have to fix all deprecation warnings"
+ + " in your code base to be able to recompile against this version. This is the first Lucene"},
+ {
+ "lucene",
+ " release with Java 5 as a minimum requirement. The API was cleaned up to make use of Java 5's "
+ + "generics, varargs, enums, and autoboxing. New users of Lucene are advised to use this version "
+ + "for new developments, because it has a clean, type safe new API. Upgrading users can now remove"},
+ {
+ "lucene",
+ " unnecessary casts and add generics to their code, too. If you have not upgraded your installation "
+ + "to Java 5, please read the file JRE_VERSION_MIGRATION.txt (please note that this is not related to"
+ + " Lucene 3.0, it will also happen with any previous release when you upgrade your Java environment)."},
+ {
+ "spamassasin",
+ "SpamAssassin is a mail filter to identify spam. It is an intelligent email filter which uses a diverse "
+ + "range of tests to identify unsolicited bulk email, more commonly known as Spam. These tests are applied "
+ + "to email headers and content to classify email using advanced statistical methods. In addition, "},
+ {
+ "spamassasin",
+ "SpamAssassin has a modular architecture that allows other technologies to be quickly wielded against spam"
+ + " and is designed for easy integration into virtually any email system."
+ + "SpamAssassin's practical multi-technique approach, modularity, and extensibility continue to give it an "},
+ {
+ "spamassasin",
+ "advantage over other anti-spam systems. Due to these advantages, SpamAssassin is widely used in all aspects "
+ + "of email management. You can readily find SpamAssassin in use in both email clients and servers, on many "
+ + "different operating systems, filtering incoming as well as outgoing email, and implementing a "
+ + "very broad range "},
+ {
+ "spamassasin",
+ "of policy actions. These installations include service providers, businesses, not-for-profit and "
+ + "educational organizations, and end-user systems. SpamAssassin also forms the basis for numerous "
+ + "commercial anti-spam products available on the market today."}};
+
+
+ // Utility class: not instantiable.
+ private ClassifierData() { }
+
+ /**
+ * Writes the given {label, text} pairs to a plain text file, one pair per line,
+ * formatted as label TAB text NEWLINE (the format the Bayes trainer consumes).
+ *
+ * NOTE(review): the writer is not closed if write() throws — consider try/finally.
+ *
+ * @param file path of the output file to create or overwrite
+ * @param content array of {label, text} pairs to write
+ * @throws IOException if the file cannot be written
+ */
+ public static void writeDataToFile(String file, String[][] content) throws IOException {
+ BufferedWriter writer = new BufferedWriter(new FileWriter(file));
+ for (String[] entry : content) {
+ writer.write(entry[0] + "\t" + entry[1] + "\n");
+ }
+ writer.close();
+ }
+
+ /**
+ * Writes the given {label, text} pairs to a Hadoop SequenceFile with the label as
+ * the Text key and the document text as the Text value.
+ *
+ * NOTE(review): as above, the writer is not closed if append() throws.
+ *
+ * @param file path of the sequence file to create
+ * @param content array of {label, text} pairs to write
+ * @param fs filesystem to write to
+ * @param conf Hadoop configuration passed to the SequenceFile writer
+ * @throws IOException if the file cannot be written
+ */
+ public static void writeDataToSequenceFile(String file,
+ String[][] content,
+ FileSystem fs,
+ Configuration conf) throws IOException {
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(file), Text.class, Text.class);
+ for (String[] entry : content) {
+ writer.append(new Text(entry[0]), new Text(entry[1]));
+ }
+ writer.close();
+ }
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java?rev=910285&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java Mon Feb 15 18:27:31 2010
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.bayes;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.ClassifierData;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm;
+import org.apache.mahout.classifier.bayes.algorithm.CBayesAlgorithm;
+import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.datastore.InMemoryBayesDatastore;
+import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException;
+import org.apache.mahout.classifier.bayes.interfaces.Algorithm;
+import org.apache.mahout.classifier.bayes.interfaces.Datastore;
+import org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver;
+import org.apache.mahout.classifier.bayes.model.ClassifierContext;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.nlp.NGrams;
+
+/**
+ * Self-test for the Bayes and Complementary Bayes classifiers: trains a model on the
+ * twelve labelled documents in ClassifierData.DATA, then verifies that both the
+ * in-memory classifier and the map/reduce (parallel) classifier reproduce every
+ * training label exactly — i.e. a 3x3 confusion matrix with 4 on each diagonal
+ * entry and 0 everywhere else.
+ */
+public class BayesClassifierSelfTest extends MahoutTestCase {
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ // Write the sample documents as label<TAB>text lines for the trainer to read.
+ ClassifierData.writeDataToFile("testdata/bayesinput", ClassifierData.DATA);
+ }
+
+ /** Trains a standard naive Bayes model; checks sequential and parallel classification. */
+ public void testSelfTestBayes() throws InvalidDatastoreException, IOException {
+ // BayesParameters(1): presumably sets gramSize=1 (unigrams) — read back below
+ // via params.get("gramSize"); TODO confirm against BayesParameters.
+ BayesParameters params = new BayesParameters(1);
+ params.set("alpha_i", "1.0");
+ params.set("dataSource", "hdfs");
+ TrainClassifier.trainNaiveBayes("testdata/bayesinput", "testdata/bayesmodel", params);
+
+ // Parameters for the classification phase, pointing at the model just trained.
+ params.set("verbose", "true");
+ params.set("basePath", "testdata/bayesmodel");
+ params.set("classifierType", "bayes");
+ params.set("dataSource", "hdfs");
+ params.set("defaultCat", "unknown");
+ params.set("encoding", "UTF-8");
+ params.set("alpha_i", "1.0");
+
+ // Sequential path: classify each training document with the in-memory datastore.
+ Algorithm algorithm = new BayesAlgorithm();
+ Datastore datastore = new InMemoryBayesDatastore(params);
+ ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
+ classifier.initialize();
+ ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
+
+ for (String[] entry : ClassifierData.DATA) {
+ List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
+ .generateNGramsWithoutLabel();
+ // Asking for up to 100 results should return one per known label — 3 in the test data.
+ assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}),
+ params.get("defaultCat"), 100).length);
+ ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params
+ .get("defaultCat"));
+ // A self-test on the training data must recover the original label.
+ assertEquals(entry[0], result.getLabel());
+ resultAnalyzer.addInstance(entry[0], result);
+ }
+ // Perfect classification: 4 documents per category on the diagonal, 0 elsewhere.
+ int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ if (i == j) assertEquals(4, matrix[i][j]);
+ else assertEquals(0, matrix[i][j]);
+ }
+ }
+ // Parallel path: run the map/reduce classifier over the same input and read back
+ // its confusion matrix from the job output files; expect the same perfect result.
+ params.set("testDirPath", "testdata/bayesinput");
+ TestClassifier.classifyParallel(params);
+ Configuration conf = new Configuration();
+ Path outputFiles = new Path("testdata/bayesinput-output/part*");
+ FileSystem fs = FileSystem.get(outputFiles.toUri(), conf);
+ matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ if (i == j) assertEquals(4, matrix[i][j]);
+ else assertEquals(0, matrix[i][j]);
+ }
+ }
+ }
+
+ /** Same checks as testSelfTestBayes, but with the Complementary Bayes algorithm. */
+ public void testSelfTestCBayes() throws InvalidDatastoreException, IOException {
+ BayesParameters params = new BayesParameters(1);
+ params.set("alpha_i", "1.0");
+ params.set("dataSource", "hdfs");
+ TrainClassifier.trainCNaiveBayes("testdata/bayesinput", "testdata/cbayesmodel", params);
+
+ // Classification parameters; note classifierType/basePath differ from the Bayes test.
+ params.set("verbose", "true");
+ params.set("basePath", "testdata/cbayesmodel");
+ params.set("classifierType", "cbayes");
+ params.set("dataSource", "hdfs");
+ params.set("defaultCat", "unknown");
+ params.set("encoding", "UTF-8");
+ params.set("alpha_i", "1.0");
+
+ Algorithm algorithm = new CBayesAlgorithm();
+ Datastore datastore = new InMemoryBayesDatastore(params);
+ ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
+ classifier.initialize();
+ ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
+ for (String[] entry : ClassifierData.DATA) {
+ List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
+ .generateNGramsWithoutLabel();
+ // One result per known label (3), and the top label must match the training label.
+ assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}),
+ params.get("defaultCat"), 100).length);
+ ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params
+ .get("defaultCat"));
+ assertEquals(entry[0], result.getLabel());
+ resultAnalyzer.addInstance(entry[0], result);
+ }
+ // Expect a perfect diagonal confusion matrix from the in-memory classifier.
+ int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ if (i == j) assertEquals(4, matrix[i][j]);
+ else assertEquals(0, matrix[i][j]);
+ }
+ }
+ // And the same from the parallel map/reduce classifier's output.
+ params.set("testDirPath", "testdata/bayesinput");
+ TestClassifier.classifyParallel(params);
+ Configuration conf = new Configuration();
+ Path outputFiles = new Path("testdata/bayesinput-output/part*");
+ FileSystem fs = FileSystem.get(outputFiles.toUri(), conf);
+ matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 3; j++) {
+ if (i == j) assertEquals(4, matrix[i][j]);
+ else assertEquals(0, matrix[i][j]);
+ }
+ }
+ }
+
+}