Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/15 19:27:32 UTC

svn commit: r910285 - in /lucene/mahout/trunk/core/src: main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/ test/java/org/apache/mahout/classifier/ test/java/org/apache/mahout/classifier/bayes/

Author: robinanil
Date: Mon Feb 15 18:27:31 2010
New Revision: 910285

URL: http://svn.apache.org/viewvc?rev=910285&view=rev
Log:
MAHOUT-293 Classifier TestData and BayesClassifier Self Test

Added:
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java?rev=910285&r1=910284&r2=910285&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/bayes/BayesClassifierDriver.java Mon Feb 15 18:27:31 2010
@@ -45,7 +45,7 @@
   
   private static final Logger log = LoggerFactory.getLogger(BayesClassifierDriver.class);
   
-  private BayesClassifierDriver() { }
+  private BayesClassifierDriver() {}
   
   /**
    * Run the job
@@ -87,10 +87,10 @@
     log.info("{}", matrix.summarize());
   }
   
-  private static ConfusionMatrix readResult(FileSystem fs,
-                                            Path pathPattern,
-                                            Configuration conf,
-                                            Parameters params) throws IOException {
+  public static ConfusionMatrix readResult(FileSystem fs,
+                                           Path pathPattern,
+                                           Configuration conf,
+                                           Parameters params) throws IOException {
     
     StringTuple key = new StringTuple();
     DoubleWritable value = new DoubleWritable();

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java?rev=910285&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/ClassifierData.java Mon Feb 15 18:27:31 2010
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Class containing sample docs from the ASF websites of the Mahout, Lucene and SpamAssassin projects.
+ */
+public final class ClassifierData {
+  
+  public static final String[][] DATA = {
+    {
+      "mahout",
+      "Mahout's goal is to build scalable machine learning libraries. With scalable we mean: "
+         + "Scalable to reasonably large data sets. Our core algorithms for clustering,"
+         + " classfication and batch based collaborative filtering are implemented on top "
+         + "of Apache Hadoop using the map/reduce paradigm. However we do not restrict "
+         + "contributions to Hadoop based implementations: Contributions that run on"},
+    {
+      "mahout",
+      " a single node or on a non-Hadoop cluster are welcome as well. The core"
+         + " libraries are highly optimized to allow for good performance also for"
+         + " non-distributed algorithms. Scalable to support your business case. "
+         + "Mahout is distributed under a commercially friendly Apache Software license. "
+         + "Scalable community. The goal of Mahout is to build a vibrant, responsive, "},
+    {
+      "mahout",
+      "diverse community to facilitate discussions not only on the project itself"
+         + " but also on potential use cases. Come to the mailing lists to find out more."
+         + " Currently Mahout supports mainly four use cases: Recommendation mining takes "
+         + "users' behavior and from that tries to find items users might like. Clustering "},
+    {
+      "mahout",
+      "takes e.g. text documents and groups them into groups of topically related documents."
+         + " Classification learns from exisiting categorized documents what documents of"
+         + " a specific category look like and is able to assign unlabelled documents to "
+         + "the (hopefully) correct category. Frequent itemset mining takes a set of item"
+         + " groups (terms in a query session, shopping cart content) and identifies, which"
+         + " individual items usually appear together."},
+    {
+      "lucene",
+      "Apache Lucene is a high-performance, full-featured text search engine library"
+         + " written entirely in Java. It is a technology suitable for nearly any application "
+         + "that requires full-text search, especially cross-platform. Apache Lucene is an open source"
+         + " project available for free download. Please use the links on the left to access Lucene. "
+         + "The new version is mostly a cleanup release without any new features. "},
+    {
+      "lucene",
+      "All deprecations targeted to be removed in version 3.0 were removed. If you "
+         + "are upgrading from version 2.9.1 of Lucene, you have to fix all deprecation warnings"
+         + " in your code base to be able to recompile against this version. This is the first Lucene"},
+    {
+      "lucene",
+      " release with Java 5 as a minimum requirement. The API was cleaned up to make use of Java 5's "
+         + "generics, varargs, enums, and autoboxing. New users of Lucene are advised to use this version "
+         + "for new developments, because it has a clean, type safe new API. Upgrading users can now remove"},
+    {
+      "lucene",
+      " unnecessary casts and add generics to their code, too. If you have not upgraded your installation "
+         + "to Java 5, please read the file JRE_VERSION_MIGRATION.txt (please note that this is not related to"
+         + " Lucene 3.0, it will also happen with any previous release when you upgrade your Java environment)."},
+    {
+      "spamassasin",
+      "SpamAssassin is a mail filter to identify spam. It is an intelligent email filter which uses a diverse "
+         + "range of tests to identify unsolicited bulk email, more commonly known as Spam. These tests are applied "
+         + "to email headers and content to classify email using advanced statistical methods. In addition, "},
+    {
+      "spamassasin",
+      "SpamAssassin has a modular architecture that allows other technologies to be quickly wielded against spam"
+         + " and is designed for easy integration into virtually any email system."
+         + "SpamAssassin's practical multi-technique approach, modularity, and extensibility continue to give it an "},
+    {
+      "spamassasin",
+      "advantage over other anti-spam systems. Due to these advantages, SpamAssassin is widely used in all aspects "
+         + "of email management. You can readily find SpamAssassin in use in both email clients and servers, on many "
+         + "different operating systems, filtering incoming as well as outgoing email, and implementing a "
+         + "very broad range "},
+    {
+      "spamassasin",
+      "of policy actions. These installations include service providers, businesses, not-for-profit and "
+         + "educational organizations, and end-user systems. SpamAssassin also forms the basis for numerous "
+         + "commercial anti-spam products available on the market today."}}; 
+
+
+  private ClassifierData() { }
+  
+  public static void writeDataToFile(String file, String[][] content) throws IOException {
+    BufferedWriter writer = new BufferedWriter(new FileWriter(file));
+    for (String[] entry : content) {
+      writer.write(entry[0] + "\t" + entry[1] + "\n");
+    }
+    writer.close();
+  }
+
+  public static void writeDataToSequenceFile(String file,
+                                             String[][] content,
+                                             FileSystem fs,
+                                             Configuration conf) throws IOException {
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(file), Text.class, Text.class);
+    for (String[] entry : content) {
+      writer.append(new Text(entry[0]), new Text(entry[1]));
+    }
+    writer.close();
+  }
+}
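
writeDataToSequenceFile stores each sample as one SequenceFile record, with the category label as the Text key and the raw document text as the Text value. A minimal sketch of reading such a file back, using the same Hadoop classes already imported above (the helper name dumpSequenceFile is hypothetical, not part of this commit):

    // Hypothetical helper: prints each (label, document) record written by writeDataToSequenceFile.
    public static void dumpSequenceFile(String file, FileSystem fs, Configuration conf) throws IOException {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file), conf);
      Text label = new Text();
      Text doc = new Text();
      while (reader.next(label, doc)) {
        System.out.println(label + "\t" + doc);
      }
      reader.close();
    }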

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java?rev=910285&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/bayes/BayesClassifierSelfTest.java Mon Feb 15 18:27:31 2010
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.bayes;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.ClassifierData;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.classifier.bayes.algorithm.BayesAlgorithm;
+import org.apache.mahout.classifier.bayes.algorithm.CBayesAlgorithm;
+import org.apache.mahout.classifier.bayes.common.BayesParameters;
+import org.apache.mahout.classifier.bayes.datastore.InMemoryBayesDatastore;
+import org.apache.mahout.classifier.bayes.exceptions.InvalidDatastoreException;
+import org.apache.mahout.classifier.bayes.interfaces.Algorithm;
+import org.apache.mahout.classifier.bayes.interfaces.Datastore;
+import org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver;
+import org.apache.mahout.classifier.bayes.model.ClassifierContext;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.nlp.NGrams;
+
+public class BayesClassifierSelfTest extends MahoutTestCase {
+  
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    ClassifierData.writeDataToFile("testdata/bayesinput", ClassifierData.DATA);
+  }
+  
+  public void testSelfTestBayes() throws InvalidDatastoreException, IOException {
+    BayesParameters params = new BayesParameters(1);
+    params.set("alpha_i", "1.0");
+    params.set("dataSource", "hdfs");
+    TrainClassifier.trainNaiveBayes("testdata/bayesinput", "testdata/bayesmodel", params);
+    
+    params.set("verbose", "true");
+    params.set("basePath", "testdata/bayesmodel");
+    params.set("classifierType", "bayes");
+    params.set("dataSource", "hdfs");
+    params.set("defaultCat", "unknown");
+    params.set("encoding", "UTF-8");
+    params.set("alpha_i", "1.0");
+    
+    Algorithm algorithm = new BayesAlgorithm();
+    Datastore datastore = new InMemoryBayesDatastore(params);
+    ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
+    classifier.initialize();
+    ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
+    
+    for (String[] entry : ClassifierData.DATA) {
+      List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
+          .generateNGramsWithoutLabel();
+      assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}),
+        params.get("defaultCat"), 100).length);
+      ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params
+          .get("defaultCat"));
+      assertEquals(entry[0], result.getLabel());
+      resultAnalyzer.addInstance(entry[0], result);
+    }
+    int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix();
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        if (i == j) assertEquals(4, matrix[i][j]);
+        else assertEquals(0, matrix[i][j]);
+      }
+    }
+    params.set("testDirPath", "testdata/bayesinput");
+    TestClassifier.classifyParallel(params);
+    Configuration conf = new Configuration();
+    Path outputFiles = new Path("testdata/bayesinput-output/part*");
+    FileSystem fs = FileSystem.get(outputFiles.toUri(), conf);
+    matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix();
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        if (i == j) assertEquals(4, matrix[i][j]);
+        else assertEquals(0, matrix[i][j]);
+      }
+    }
+  }
+  
+  public void testSelfTestCBayes() throws InvalidDatastoreException, IOException {
+    BayesParameters params = new BayesParameters(1);
+    params.set("alpha_i", "1.0");
+    params.set("dataSource", "hdfs");
+    TrainClassifier.trainCNaiveBayes("testdata/bayesinput", "testdata/cbayesmodel", params);
+    
+    params.set("verbose", "true");
+    params.set("basePath", "testdata/cbayesmodel");
+    params.set("classifierType", "cbayes");
+    params.set("dataSource", "hdfs");
+    params.set("defaultCat", "unknown");
+    params.set("encoding", "UTF-8");
+    params.set("alpha_i", "1.0");
+    
+    Algorithm algorithm = new CBayesAlgorithm();
+    Datastore datastore = new InMemoryBayesDatastore(params);
+    ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
+    classifier.initialize();
+    ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
+    for (String[] entry : ClassifierData.DATA) {
+      List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
+          .generateNGramsWithoutLabel();
+      assertEquals(3, classifier.classifyDocument(document.toArray(new String[] {}),
+        params.get("defaultCat"), 100).length);
+      ClassifierResult result = classifier.classifyDocument(document.toArray(new String[] {}), params
+          .get("defaultCat"));
+      assertEquals(entry[0], result.getLabel());
+      resultAnalyzer.addInstance(entry[0], result);
+    }
+    int[][] matrix = resultAnalyzer.getConfusionMatrix().getConfusionMatrix();
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        if (i == j) assertEquals(4, matrix[i][j]);
+        else assertEquals(0, matrix[i][j]);
+      }
+    }
+    params.set("testDirPath", "testdata/bayesinput");
+    TestClassifier.classifyParallel(params);
+    Configuration conf = new Configuration();
+    Path outputFiles = new Path("testdata/bayesinput-output/part*");
+    FileSystem fs = FileSystem.get(outputFiles.toUri(), conf);
+    matrix = BayesClassifierDriver.readResult(fs, outputFiles, conf, params).getConfusionMatrix();
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        if (i == j) assertEquals(4, matrix[i][j]);
+        else assertEquals(0, matrix[i][j]);
+      }
+    }
+  }
+  
+}
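
In both tests the 12 sample documents split evenly into 4 per label (mahout, lucene, spamassasin), so a correct classifier yields a 3x3 confusion matrix with 4 on the diagonal and 0 everywhere else, which is what the nested loops assert four times over. A minimal sketch of a shared helper that would express that expectation once (assertDiagonalMatrix is hypothetical, not part of this commit):

    // Hypothetical helper: asserts a purely diagonal confusion matrix with
    // docsPerLabel correctly classified documents per label and no misclassifications.
    private static void assertDiagonalMatrix(int[][] matrix, int numLabels, int docsPerLabel) {
      for (int i = 0; i < numLabels; i++) {
        for (int j = 0; j < numLabels; j++) {
          assertEquals(i == j ? docsPerLabel : 0, matrix[i][j]);
        }
      }
    }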