You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2017/04/07 08:59:41 UTC

[1/3] lucene-solr:master: LUCENE-5548 - improved testing for SNBC

Repository: lucene-solr
Updated Branches:
  refs/heads/master c05ab96dc -> f37fad206


LUCENE-5548 - improved testing for SNBC (SimpleNaiveBayesClassifier)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f37fad20
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f37fad20
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f37fad20

Branch: refs/heads/master
Commit: f37fad206b0dbf32209bff3761179458a1ddf7bf
Parents: 276ccff
Author: Tommaso Teofili <to...@apache.org>
Authored: Fri Apr 7 10:58:49 2017 +0200
Committer: Tommaso Teofili <to...@apache.org>
Committed: Fri Apr 7 10:59:20 2017 +0200

----------------------------------------------------------------------
 .../classification/SimpleNaiveBayesClassifierTest.java   | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f37fad20/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
index 2b4873d..0e05d4f 100644
--- a/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
+++ b/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
@@ -59,8 +59,10 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
     try {
       MockAnalyzer analyzer = new MockAnalyzer(random());
       leafReader = getSampleIndex(analyzer);
-      TermQuery query = new TermQuery(new Term(textFieldName, "it"));
-      checkCorrectClassification(new SimpleNaiveBayesClassifier(leafReader, analyzer, query, categoryFieldName, textFieldName), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
+      TermQuery query = new TermQuery(new Term(textFieldName, "a"));
+      SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier(leafReader, analyzer, query, categoryFieldName, textFieldName);
+      checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
+      checkCorrectClassification(classifier, POLITICS_INPUT, POLITICS_RESULT);
     } finally {
       if (leafReader != null) {
         leafReader.close();
@@ -112,6 +114,11 @@ public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase<Bytes
       assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
       double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
       assertTrue("avg classification time: " + avgClassificationTime, 5000 > avgClassificationTime);
+
+      double f1 = confusionMatrix.getF1Measure();
+      assertTrue(f1 >= 0d);
+      assertTrue(f1 <= 1d);
+
       double accuracy = confusionMatrix.getAccuracy();
       assertTrue(accuracy >= 0d);
       assertTrue(accuracy <= 1d);


[3/3] lucene-solr:master: LUCENE-6853 - renamed threshold to bias, initialize to avg tf

Posted by to...@apache.org.
LUCENE-6853 - renamed threshold to bias, initialize to avg tf


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cbad533d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cbad533d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cbad533d

Branch: refs/heads/master
Commit: cbad533d7a44a5fd41f85756c791f3d7439861a2
Parents: c05ab96
Author: Tommaso Teofili <to...@apache.org>
Authored: Thu Apr 6 19:05:52 2017 +0200
Committer: Tommaso Teofili <to...@apache.org>
Committed: Fri Apr 7 10:59:20 2017 +0200

----------------------------------------------------------------------
 .../BooleanPerceptronClassifier.java            | 26 ++++++++++----------
 .../BooleanPerceptronClassifierTest.java        | 14 +++++++----
 2 files changed, 22 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cbad533d/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
index 781a14f..928c036 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
@@ -58,7 +58,7 @@ import org.apache.lucene.util.fst.Util;
  */
 public class BooleanPerceptronClassifier implements Classifier<Boolean> {
 
-  private final Double threshold;
+  private final Double bias;
   private final Terms textTerms;
   private final Analyzer analyzer;
   private final String textFieldName;
@@ -72,14 +72,14 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
    * @param query          a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
    *                       if all the indexed docs should be used
    * @param batchSize      the size of the batch of docs to use for updating the perceptron weights
-   * @param threshold      the threshold used for class separation
+   * @param bias      the bias used for class separation
    * @param classFieldName the name of the field used as the output for the classifier
    * @param textFieldName  the name of the field used as input for the classifier
    * @throws IOException if the building of the underlying {@link FST} fails and / or {@link TermsEnum} for the text field
    *                     cannot be found
    */
   public BooleanPerceptronClassifier(IndexReader indexReader, Analyzer analyzer, Query query, Integer batchSize,
-                                     Double threshold, String classFieldName, String textFieldName) throws IOException {
+                                     Double bias, String classFieldName, String textFieldName) throws IOException {
     this.textTerms = MultiFields.getTerms(indexReader, textFieldName);
 
     if (textTerms == null) {
@@ -89,18 +89,18 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
     this.analyzer = analyzer;
     this.textFieldName = textFieldName;
 
-    if (threshold == null || threshold == 0d) {
-      // automatic assign a threshold
-      long sumDocFreq = indexReader.getSumDocFreq(textFieldName);
-      if (sumDocFreq != -1) {
-        this.threshold = (double) sumDocFreq / 2d;
+    if (bias == null || bias == 0d) {
+      // automatic assign the bias to be the average total term freq
+      double t = (double) indexReader.getSumTotalTermFreq(textFieldName) / (double) indexReader.getDocCount(textFieldName);
+      if (t != -1) {
+        this.bias = t;
       } else {
         throw new IOException(
-                "threshold cannot be assigned since term vectors for field "
+                "bias cannot be assigned since term vectors for field "
                         + textFieldName + " do not exist");
       }
     } else {
-      this.threshold = threshold;
+      this.bias = bias;
     }
 
     // TODO : remove this map as soon as we have a writable FST
@@ -173,7 +173,7 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
         // update weights
         Long previousValue = Util.get(fst, term);
         String termString = term.utf8ToString();
-        weights.put(termString, previousValue == null ? 0 : previousValue + modifier * termFreqLocal);
+        weights.put(termString, previousValue == null ? 0 : Math.max(0, previousValue + modifier * termFreqLocal));
       }
     }
     if (updateFST) {
@@ -216,8 +216,8 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
       tokenStream.end();
     }
 
-    double score = 1 - Math.exp(-1 * Math.abs(threshold - output.doubleValue()) / threshold);
-    return new ClassificationResult<>(output >= threshold, score);
+    double score = 1 - Math.exp(-1 * Math.abs(bias - output.doubleValue()) / bias);
+    return new ClassificationResult<>(output >= bias, score);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cbad533d/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
index 6ea92c0..ec059f7 100644
--- a/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
+++ b/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
@@ -34,7 +34,9 @@ public class BooleanPerceptronClassifierTest extends ClassificationTestBase<Bool
     try {
       MockAnalyzer analyzer = new MockAnalyzer(random());
       leafReader = getSampleIndex(analyzer);
-      checkCorrectClassification(new BooleanPerceptronClassifier(leafReader, analyzer, null, 1, null, booleanFieldName, textFieldName), TECHNOLOGY_INPUT, false);
+      BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier(leafReader, analyzer, null, 1, null, booleanFieldName, textFieldName);
+      checkCorrectClassification(classifier, TECHNOLOGY_INPUT, false);
+      checkCorrectClassification(classifier, POLITICS_INPUT, true);
     } finally {
       if (leafReader != null) {
         leafReader.close();
@@ -60,12 +62,14 @@ public class BooleanPerceptronClassifierTest extends ClassificationTestBase<Bool
 
   @Test
   public void testBasicUsageWithQuery() throws Exception {
-    TermQuery query = new TermQuery(new Term(textFieldName, "it"));
+    TermQuery query = new TermQuery(new Term(textFieldName, "of"));
     LeafReader leafReader = null;
     try {
       MockAnalyzer analyzer = new MockAnalyzer(random());
       leafReader = getSampleIndex(analyzer);
-      checkCorrectClassification(new BooleanPerceptronClassifier(leafReader, analyzer, query, 1, null, booleanFieldName, textFieldName), TECHNOLOGY_INPUT, false);
+      BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier(leafReader, analyzer, query, 1, null, booleanFieldName, textFieldName);
+      checkCorrectClassification(classifier, TECHNOLOGY_INPUT, false);
+      checkCorrectClassification(classifier, POLITICS_INPUT, true);
     } finally {
       if (leafReader != null) {
         leafReader.close();
@@ -94,8 +98,8 @@ public class BooleanPerceptronClassifierTest extends ClassificationTestBase<Bool
       double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
       assertTrue(5000 > avgClassificationTime);
       // accuracy check disabled until LUCENE-6853 is fixed
-//      double accuracy = confusionMatrix.getAccuracy();
-//      assertTrue(accuracy > 0d);
+      double accuracy = confusionMatrix.getAccuracy();
+      assertTrue(accuracy > 0d);
     } finally {
       leafReader.close();
     }


[2/3] lucene-solr:master: LUCENE-6853 - re-enabled test classification measures for bpc

Posted by to...@apache.org.
LUCENE-6853 - re-enabled test classification measures for bpc (BooleanPerceptronClassifier)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/276ccff7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/276ccff7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/276ccff7

Branch: refs/heads/master
Commit: 276ccff751dc0f5cd0599dd5a6682553a0b58d7b
Parents: cbad533
Author: Tommaso Teofili <to...@apache.org>
Authored: Thu Apr 6 19:13:50 2017 +0200
Committer: Tommaso Teofili <to...@apache.org>
Committed: Fri Apr 7 10:59:20 2017 +0200

----------------------------------------------------------------------
 .../BooleanPerceptronClassifierTest.java        | 39 ++++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/276ccff7/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java b/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
index ec059f7..5ecf9c6 100644
--- a/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
+++ b/lucene/classification/src/test/org/apache/lucene/classification/BooleanPerceptronClassifierTest.java
@@ -19,8 +19,12 @@ package org.apache.lucene.classification;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
 import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.BytesRef;
 import org.junit.Test;
 
 /**
@@ -90,16 +94,45 @@ public class BooleanPerceptronClassifierTest extends ClassificationTestBase<Bool
 
       long evaluationStart = System.currentTimeMillis();
       ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
-          classifier, categoryFieldName, textFieldName, -1);
+          classifier, booleanFieldName, textFieldName, -1);
       assertNotNull(confusionMatrix);
       long evaluationEnd = System.currentTimeMillis();
       long evaluationTime = evaluationEnd - evaluationStart;
       assertTrue("evaluation took more than 1m: " + evaluationTime / 1000 + "s", evaluationTime < 60000);
       double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
       assertTrue(5000 > avgClassificationTime);
-      // accuracy check disabled until LUCENE-6853 is fixed
+
+      double f1 = confusionMatrix.getF1Measure();
+      assertTrue(f1 >= 0d);
+      assertTrue(f1 <= 1d);
+
       double accuracy = confusionMatrix.getAccuracy();
-      assertTrue(accuracy > 0d);
+      assertTrue(accuracy >= 0d);
+      assertTrue(accuracy <= 1d);
+
+      double recall = confusionMatrix.getRecall();
+      assertTrue(recall >= 0d);
+      assertTrue(recall <= 1d);
+
+      double precision = confusionMatrix.getPrecision();
+      assertTrue(precision >= 0d);
+      assertTrue(precision <= 1d);
+
+      Terms terms = MultiFields.getTerms(leafReader, booleanFieldName);
+      TermsEnum iterator = terms.iterator();
+      BytesRef term;
+      while ((term = iterator.next()) != null) {
+        String s = term.utf8ToString();
+        recall = confusionMatrix.getRecall(s);
+        assertTrue(recall >= 0d);
+        assertTrue(recall <= 1d);
+        precision = confusionMatrix.getPrecision(s);
+        assertTrue(precision >= 0d);
+        assertTrue(precision <= 1d);
+        double f1Measure = confusionMatrix.getF1Measure(s);
+        assertTrue(f1Measure >= 0d);
+        assertTrue(f1Measure <= 1d);
+      }
     } finally {
       leafReader.close();
     }