You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/11/10 14:14:31 UTC
[4/4] lucene-solr:master: LUCENE-7524: More detailed explanation of idf.
LUCENE-7524: More detailed explanation of idf.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6b9f1131
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6b9f1131
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6b9f1131
Branch: refs/heads/master
Commit: 6b9f11311a2481743d8b2b42a875a11162a5e215
Parents: 3e15233
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Nov 10 14:08:26 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Nov 10 15:01:49 2016 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../search/similarities/BM25Similarity.java | 20 ++++++++++----------
.../search/similarities/ClassicSimilarity.java | 13 +++++++++++++
.../search/similarities/TFIDFSimilarity.java | 12 +++++-------
.../org/apache/lucene/search/CheckHits.java | 10 ++++++----
5 files changed, 37 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 97e6939..5b9fee7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,6 +77,9 @@ Improvements
now throws IllegalArgumentException instead of a cryptic exception
that closes your IndexWriter (Steve Chen via Mike McCandless)
+* LUCENE-7524: Added more detailed explanation of how IDF is computed in
+ ClassicSimilarity and BM25Similarity. (Adrien Grand)
+
======================= Lucene 6.3.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index ff390b3..6763118 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
- return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
+ return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
+ Explanation.match(df, "docFreq"),
+ Explanation.match(docCount, "docCount"));
}
/**
@@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity {
* for each term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
- final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
- float idf = 0.0f;
+ double idf = 0d; // sum into a double before casting into a float
List<Explanation> details = new ArrayList<>();
for (final TermStatistics stat : termStats ) {
- final long df = stat.docFreq();
- final float termIdf = idf(df, docCount);
- details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
- idf += termIdf;
+ Explanation idfExplain = idfExplain(collectionStats, stat);
+ details.add(idfExplain);
+ idf += idfExplain.getValue();
}
- return Explanation.match(idf, "idf(), sum of:", details);
+ return Explanation.match((float) idf, "idf(), sum of:", details);
}
@Override
@@ -303,7 +303,7 @@ public class BM25Similarity extends Similarity {
subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
- "tfNorm, computed from:", subs);
+ "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
} else {
byte norm;
if (norms.advanceExact(doc)) {
@@ -317,7 +317,7 @@ public class BM25Similarity extends Similarity {
subs.add(Explanation.match(doclen, "fieldLength"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
- "tfNorm, computed from:", subs);
+ "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index fae85a0..5a1e237 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
@@ -121,6 +124,16 @@ public class ClassicSimilarity extends TFIDFSimilarity {
return 1;
}
+ @Override
+ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
+ final long df = termStats.docFreq();
+ final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+ final float idf = idf(df, docCount);
+ return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
+ Explanation.match(df, "docFreq"),
+ Explanation.match(docCount, "docCount"));
+ }
+
/** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
@Override
public float idf(long docFreq, long docCount) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
index cd8acd6..12ab1a2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -484,16 +484,14 @@ public abstract class TFIDFSimilarity extends Similarity {
* for each term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
- final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
- float idf = 0.0f;
+ double idf = 0d; // sum into a double before casting into a float
List<Explanation> subs = new ArrayList<>();
for (final TermStatistics stat : termStats ) {
- final long df = stat.docFreq();
- final float termIdf = idf(df, docCount);
- subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
- idf += termIdf;
+ Explanation idfExplain = idfExplain(collectionStats, stat);
+ subs.add(idfExplain);
+ idf += idfExplain.getValue();
}
- return Explanation.match(idf, "idf(), sum of:", subs);
+ return Explanation.match((float) idf, "idf(), sum of:", subs);
}
/** Computes a score factor based on a term's document frequency (the number
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6b9f1131/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
index 9a14b5d..dee7d84 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java
@@ -368,8 +368,9 @@ public class CheckHits {
boolean productOf = descr.endsWith("product of:");
boolean sumOf = descr.endsWith("sum of:");
boolean maxOf = descr.endsWith("max of:");
+ boolean computedOf = descr.matches(".*, computed as .* from:");
boolean maxTimesOthers = false;
- if (!(productOf || sumOf || maxOf)) {
+ if (!(productOf || sumOf || maxOf || computedOf)) {
// maybe 'max plus x times others'
int k1 = descr.indexOf("max plus ");
if (k1>=0) {
@@ -387,9 +388,9 @@ public class CheckHits {
// TODO: this is a TERRIBLE assertion!!!!
Assert.assertTrue(
q+": multi valued explanation description=\""+descr
- +"\" must be 'max of plus x times others' or end with 'product of'"
+ +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
+" or 'sum of:' or 'max of:' - "+expl,
- productOf || sumOf || maxOf || maxTimesOthers);
+ productOf || sumOf || maxOf || computedOf || maxTimesOthers);
float sum = 0;
float product = 1;
float max = 0;
@@ -410,7 +411,8 @@ public class CheckHits {
} else if (maxTimesOthers) {
combined = max + x * (sum - max);
} else {
- Assert.assertTrue("should never get here!",false);
+ Assert.assertTrue("should never get here!", computedOf);
+ combined = value;
}
Assert.assertEquals(q+": actual subDetails combined=="+combined+
" != value="+value+" Explanation: "+expl,