You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2013/04/08 12:15:13 UTC

svn commit: r1465586 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ lucene/ lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/ lucene/benchmark/ lucene/classification/ lucene/classification/src/ lucene/clas...

Author: tommaso
Date: Mon Apr  8 10:15:12 2013
New Revision: 1465586

URL: http://svn.apache.org/r1465586
Log:
LUCENE-4917 - merged back to branch_4x

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/README.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/ivy.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/src/   (props changed)
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
    lucene/dev/branches/branch_4x/lucene/codecs/   (props changed)
    lucene/dev/branches/branch_4x/lucene/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSort.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/demo/   (props changed)
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/grouping/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/join/   (props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/lucene/memory/   (props changed)
    lucene/dev/branches/branch_4x/lucene/misc/   (props changed)
    lucene/dev/branches/branch_4x/lucene/module-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/queries/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_4x/lucene/sandbox/   (props changed)
    lucene/dev/branches/branch_4x/lucene/site/   (props changed)
    lucene/dev/branches/branch_4x/lucene/spatial/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/tools/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/README.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/cloud-dev/   (props changed)
    lucene/dev/branches/branch_4x/solr/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/core/TestConfig.java   (props changed)
    lucene/dev/branches/branch_4x/solr/example/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/scripts/   (props changed)
    lucene/dev/branches/branch_4x/solr/site/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/solr/webapp/   (props changed)

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?rev=1465586&r1=1465585&r2=1465586&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Mon Apr  8 10:15:12 2013
@@ -58,6 +58,9 @@ public class KNearestNeighborClassifier 
    */
   @Override
   public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
+    if (mlt == null) {
+      throw new IOException("You must first call Classifier#train first");
+    }
     Query q = mlt.like(new StringReader(text), textFieldName);
     TopDocs topDocs = indexSearcher.search(q, k);
     return selectClassFromNeighbors(topDocs);

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1465586&r1=1465585&r2=1465586&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Mon Apr  8 10:15:12 2013
@@ -103,7 +103,7 @@ public class SimpleNaiveBayesClassifier 
   @Override
   public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
     if (atomicReader == null) {
-      throw new RuntimeException("need to train the classifier first");
+      throw new IOException("You must first call Classifier#train first");
     }
     double max = 0d;
     BytesRef foundClass = new BytesRef();

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?rev=1465586&r1=1465585&r2=1465586&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Mon Apr  8 10:15:12 2013
@@ -24,7 +24,6 @@ import org.apache.lucene.document.TextFi
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
@@ -32,12 +31,13 @@ import org.junit.Before;
 /**
  * Base class for testing {@link Classifier}s
  */
-public abstract class ClassificationTestBase extends LuceneTestCase {
+public abstract class ClassificationTestBase<T> extends LuceneTestCase {
 
   private RandomIndexWriter indexWriter;
   private String textFieldName;
-  private String classFieldName;
   private Directory dir;
+  String categoryFieldName;
+  String booleanFieldName;
 
   @Before
   public void setUp() throws Exception {
@@ -45,7 +45,8 @@ public abstract class ClassificationTest
     dir = newDirectory();
     indexWriter = new RandomIndexWriter(random(), dir);
     textFieldName = "text";
-    classFieldName = "cat";
+    categoryFieldName = "cat";
+    booleanFieldName = "bool";
   }
 
   @After
@@ -56,17 +57,17 @@ public abstract class ClassificationTest
   }
 
 
-  protected void checkCorrectClassification(Classifier<BytesRef> classifier, Analyzer analyzer) throws Exception {
+  protected void checkCorrectClassification(Classifier<T> classifier, T expectedResult, Analyzer analyzer, String classFieldName) throws Exception {
     SlowCompositeReaderWrapper compositeReaderWrapper = null;
     try {
       populateIndex(analyzer);
       compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
       classifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
       String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
-      ClassificationResult<BytesRef> classificationResult = classifier.assignClass(newText);
+      ClassificationResult<T> classificationResult = classifier.assignClass(newText);
       assertNotNull(classificationResult.getAssignedClass());
-      assertEquals(new BytesRef("technology"), classificationResult.getAssignedClass());
-      assertTrue(classificationResult.getScore() > 0);
+      assertEquals("got an assigned class of " + classificationResult.getAssignedClass(), expectedResult, classificationResult.getAssignedClass());
+      assertTrue("got a not positive score " + classificationResult.getScore(), classificationResult.getScore() > 0);
     } finally {
       if (compositeReaderWrapper != null)
         compositeReaderWrapper.close();
@@ -84,48 +85,55 @@ public abstract class ClassificationTest
     doc.add(new Field(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
         "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
         "the Unknown Soldier in Warsaw Tuesday.", ft));
-    doc.add(new Field(classFieldName, "politics", ft));
+    doc.add(new Field(categoryFieldName, "politics", ft));
+    doc.add(new Field(booleanFieldName, "false", ft));
 
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
         " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", ft));
-    doc.add(new Field(classFieldName, "politics", ft));
+    doc.add(new Field(categoryFieldName, "politics", ft));
+    doc.add(new Field(booleanFieldName, "false", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
         "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
         "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", ft));
-    doc.add(new Field(classFieldName, "politics", ft));
+    doc.add(new Field(categoryFieldName, "politics", ft));
+    doc.add(new Field(booleanFieldName, "false", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
         "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
         "Albany's School of Criminal Justice.", ft));
-    doc.add(new Field(classFieldName, "politics", ft));
+    doc.add(new Field(categoryFieldName, "politics", ft));
+    doc.add(new Field(booleanFieldName, "false", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
         "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
         "world through the Internet.", ft));
-    doc.add(new Field(classFieldName, "technology", ft));
+    doc.add(new Field(categoryFieldName, "technology", ft));
+    doc.add(new Field(booleanFieldName, "true", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
         "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", ft));
-    doc.add(new Field(classFieldName, "technology", ft));
+    doc.add(new Field(categoryFieldName, "technology", ft));
+    doc.add(new Field(booleanFieldName, "true", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
     doc.add(new Field(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
         " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
         "generally transfer or store huge volumes of personal data online.", ft));
-    doc.add(new Field(classFieldName, "technology", ft));
+    doc.add(new Field(categoryFieldName, "technology", ft));
+    doc.add(new Field(booleanFieldName, "true", ft));
     indexWriter.addDocument(doc, analyzer);
 
     indexWriter.commit();

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?rev=1465586&r1=1465585&r2=1465586&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Mon Apr  8 10:15:12 2013
@@ -17,16 +17,17 @@
 package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.util.BytesRef;
 import org.junit.Test;
 
 /**
  * Testcase for {@link KNearestNeighborClassifier}
  */
-public class KNearestNeighborClassifierTest extends ClassificationTestBase {
+public class KNearestNeighborClassifierTest extends ClassificationTestBase<BytesRef> {
 
   @Test
   public void testBasicUsage() throws Exception {
-    checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
+     checkCorrectClassification(new KNearestNeighborClassifier(1), new BytesRef("technology"), new MockAnalyzer(random()), categoryFieldName);
   }
 
 }

Modified: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1465586&r1=1465585&r2=1465586&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Mon Apr  8 10:15:12 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.classification
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 
@@ -29,16 +30,16 @@ import java.io.Reader;
  */
 // TODO : eventually remove this if / when fallback methods exist for all un-supportable codec methods (see LUCENE-4872)
 @LuceneTestCase.SuppressCodecs("Lucene3x")
-public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<BytesRef> {
 
   @Test
   public void testBasicUsage() throws Exception {
-    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new BytesRef("technology"), new MockAnalyzer(random()), categoryFieldName);
   }
 
   @Test
   public void testNGramUsage() throws Exception {
-    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new BytesRef("technology"), new NGramAnalyzer(), categoryFieldName);
   }
 
   private class NGramAnalyzer extends Analyzer {