You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/11/10 14:14:31 UTC

[4/4] lucene-solr:master: LUCENE-7524: More detailed explanation of idf.

LUCENE-7524: More detailed explanation of idf.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6b9f1131
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6b9f1131
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6b9f1131

Branch: refs/heads/master
Commit: 6b9f11311a2481743d8b2b42a875a11162a5e215
Parents: 3e15233
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Nov 10 14:08:26 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Nov 10 15:01:49 2016 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +++
 .../search/similarities/BM25Similarity.java     | 20 ++++++++++----------
 .../search/similarities/ClassicSimilarity.java  | 13 +++++++++++++
 .../search/similarities/TFIDFSimilarity.java    | 12 +++++-------
 .../org/apache/lucene/search/CheckHits.java     | 10 ++++++----
 5 files changed, 37 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 97e6939..5b9fee7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,6 +77,9 @@ Improvements
   now throws IllegalArgumentException instead of a cryptic exception
   that closes your IndexWriter (Steve Chen via Mike McCandless)
 
+* LUCENE-7524: Added more detailed explanation of how IDF is computed in
+  ClassicSimilarity and BM25Similarity. (Adrien Grand)
+
 ======================= Lucene 6.3.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index ff390b3..6763118 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity {
     final long df = termStats.docFreq();
     final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
     final float idf = idf(df, docCount);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
+    return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
+        Explanation.match(df, "docFreq"),
+        Explanation.match(docCount, "docCount"));
   }
 
   /**
@@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity {
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
-    float idf = 0.0f;
+    double idf = 0d; // sum into a double before casting into a float
     List<Explanation> details = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
-      final long df = stat.docFreq();
-      final float termIdf = idf(df, docCount);
-      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
-      idf += termIdf;
+      Explanation idfExplain = idfExplain(collectionStats, stat);
+      details.add(idfExplain);
+      idf += idfExplain.getValue();
     }
-    return Explanation.match(idf, "idf(), sum of:", details);
+    return Explanation.match((float) idf, "idf(), sum of:", details);
   }
 
   @Override
@@ -303,7 +303,7 @@ public class BM25Similarity extends Similarity {
       subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
       return Explanation.match(
           (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
-          "tfNorm, computed from:", subs);
+          "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
     } else {
       byte norm;
       if (norms.advanceExact(doc)) {
@@ -317,7 +317,7 @@ public class BM25Similarity extends Similarity {
       subs.add(Explanation.match(doclen, "fieldLength"));
       return Explanation.match(
           (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
-          "tfNorm, computed from:", subs);
+          "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index fae85a0..5a1e237 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities;
 
 
 import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;
 
@@ -121,6 +124,16 @@ public class ClassicSimilarity extends TFIDFSimilarity {
     return 1;
   }
 
+  @Override
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
+    final long df = termStats.docFreq();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
+        Explanation.match(df, "docFreq"),
+        Explanation.match(docCount, "docCount"));
+  }
+
   /** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
   @Override
   public float idf(long docFreq, long docCount) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
index cd8acd6..12ab1a2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -484,16 +484,14 @@ public abstract class TFIDFSimilarity extends Similarity {
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
-    float idf = 0.0f;
+    double idf = 0d; // sum into a double before casting into a float
     List<Explanation> subs = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
-      final long df = stat.docFreq();
-      final float termIdf = idf(df, docCount);
-      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
-      idf += termIdf;
+      Explanation idfExplain = idfExplain(collectionStats, stat);
+      subs.add(idfExplain);
+      idf += idfExplain.getValue();
     }
-    return Explanation.match(idf, "idf(), sum of:", subs);
+    return Explanation.match((float) idf, "idf(), sum of:", subs);
   }
 
   /** Computes a score factor based on a term's document frequency (the number

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
index 9a14b5d..dee7d84 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
@@ -368,8 +368,9 @@ public class CheckHits {
         boolean productOf = descr.endsWith("product of:");
         boolean sumOf = descr.endsWith("sum of:");
         boolean maxOf = descr.endsWith("max of:");
+        boolean computedOf = descr.matches(".*, computed as .* from:");
         boolean maxTimesOthers = false;
-        if (!(productOf || sumOf || maxOf)) {
+        if (!(productOf || sumOf || maxOf || computedOf)) {
           // maybe 'max plus x times others'
           int k1 = descr.indexOf("max plus ");
           if (k1>=0) {
@@ -387,9 +388,9 @@ public class CheckHits {
         // TODO: this is a TERRIBLE assertion!!!!
         Assert.assertTrue(
             q+": multi valued explanation description=\""+descr
-            +"\" must be 'max of plus x times others' or end with 'product of'"
+            +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
             +" or 'sum of:' or 'max of:' - "+expl,
-            productOf || sumOf || maxOf || maxTimesOthers);
+            productOf || sumOf || maxOf || computedOf || maxTimesOthers);
         float sum = 0;
         float product = 1;
         float max = 0;
@@ -410,7 +411,8 @@ public class CheckHits {
         } else if (maxTimesOthers) {
           combined = max + x * (sum - max);
         } else {
-            Assert.assertTrue("should never get here!",false);
+          Assert.assertTrue("should never get here!", computedOf);
+          combined = value;
         }
         Assert.assertEquals(q+": actual subDetails combined=="+combined+
             " != value="+value+" Explanation: "+expl,