You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/10 14:21:03 UTC
svn commit: r1156124 -
/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java
Author: rmuir
Date: Wed Aug 10 12:21:03 2011
New Revision: 1156124
URL: http://svn.apache.org/viewvc?rev=1156124&view=rev
Log:
some cleanups/tuning for bm25
Modified:
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java?rev=1156124&r1=1156123&r2=1156124&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/MockBM25Similarity.java Wed Aug 10 12:21:03 2011
@@ -40,14 +40,36 @@ public class MockBM25Similarity extends
/** Sets the default values for BM25:
* <ul>
- * <li>{@code k1 = 2.0},
+ * <li>{@code k1 = 1.2},</li>
* <li>{@code b = 0.75}.</li>
* </ul>
*/
public MockBM25Similarity() {
- this.k1 = 2.0f;
+ this.k1 = 1.2f;
this.b = 0.75f;
}
+
+ /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
+ public float idf(int docFreq, int numDocs) {
+ return (float) Math.log(1 + ((numDocs - docFreq + 0.5D)/(docFreq + 0.5D)));
+ }
+
+ /** Implemented as <code>1 / (distance + 1)</code>. */
+ public float sloppyFreq(int distance) {
+ return 1.0f / (distance + 1);
+ }
+
+ /** The default implementation returns <code>1</code>. */
+ public float scorePayload(int doc, int start, int end, BytesRef payload) {
+ return 1;
+ }
+
+ /** Returns the average document length across the field (or 1 if the codec does not store sumTotalTermFreq). */
+ public float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
+ long sumTotalTermFreq = MultiFields.getTerms(searcher.getIndexReader(), field).getSumTotalTermFreq();
+ long maxdoc = searcher.getIndexReader().maxDoc();
+ return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc);
+ }
@Override
public byte computeNorm(FieldInvertState state) {
@@ -55,6 +77,14 @@ public class MockBM25Similarity extends
return encodeNormValue(state.getBoost() / (float) Math.sqrt(numTerms));
}
+ public float decodeNormValue(byte b) {
+ return NORM_TABLE[b & 0xFF];
+ }
+
+ public byte encodeNormValue(float f) {
+ return SmallFloat.floatToByte315(f);
+ }
+
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
@@ -64,63 +94,36 @@ public class MockBM25Similarity extends
NORM_TABLE[i] = 1.0f / (f*f);
}
}
-
- public float decodeNormValue(byte b) {
- return NORM_TABLE[b & 0xFF];
- }
- public byte encodeNormValue(float f) {
- return SmallFloat.floatToByte315(f);
- }
-
- // weight for a term as log(1 + ((n - dfj + 0.5F)/(dfj + 0.5F)))
- // TODO: are we summing this in the right place for phrase estimation????
@Override
public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
float value = 0.0f;
- final StringBuilder exp = new StringBuilder();
-
final int max = searcher.maxDoc();
for (final TermContext stat : termStats ) {
- final int dfj = stat.docFreq();
- value += Math.log(1 + ((max - dfj + 0.5F)/(dfj + 0.5F)));
- exp.append(" ");
- exp.append(dfj);
+ value += idf(stat.docFreq(), max);
}
- return new BM25Stats(value, queryBoost, avgDocumentLength(searcher, fieldName));
+ return new BM25Stats(value, queryBoost, avgFieldLength(searcher, fieldName));
}
@Override
- public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+ public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new ExactBM25DocScorer((BM25Stats) stats, context.reader.norms(fieldName));
}
@Override
- public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
+ public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new SloppyBM25DocScorer((BM25Stats) stats, context.reader.norms(fieldName));
}
- /** return avg doc length across the field (or 1 if the codec does not store sumTotalTermFreq) */
- private float avgDocumentLength(IndexSearcher searcher, String field) throws IOException {
- if (!searcher.reader.hasNorms(field)) {
- return 0f;
- } else {
- long sumTotalTermFreq = MultiFields.getTerms(searcher.reader, field).getSumTotalTermFreq();
- long maxdoc = searcher.reader.maxDoc();
- return sumTotalTermFreq == -1 ? 1f : (float) (sumTotalTermFreq / (double) maxdoc);
- }
- }
-
private class ExactBM25DocScorer extends ExactDocScorer {
private final float weightValue;
private final byte[] norms;
private final float avgdl;
ExactBM25DocScorer(BM25Stats stats, byte norms[]) {
- // we incorporate boost here up front... maybe we should multiply by tf instead?
- this.weightValue = stats.idf * stats.queryBoost * stats.topLevelBoost;
+ this.weightValue = stats.weight;
this.avgdl = stats.avgdl;
this.norms = norms;
}
@@ -128,7 +131,8 @@ public class MockBM25Similarity extends
// todo: optimize
@Override
public float score(int doc, int freq) {
- float norm = norms == null ? 0 : k1 * ((1 - b) + b * (decodeNormValue(norms[doc])) / (avgdl));
+ // if there are no norms, we act as if b=0
+ float norm = norms == null ? k1 : k1 * ((1 - b) + b * (decodeNormValue(norms[doc])) / (avgdl));
return weightValue * (freq * (k1 + 1)) / (freq + norm);
}
}
@@ -139,8 +143,7 @@ public class MockBM25Similarity extends
private final float avgdl;
SloppyBM25DocScorer(BM25Stats stats, byte norms[]) {
- // we incorporate boost here up front... maybe we should multiply by tf instead?
- this.weightValue = stats.idf * stats.queryBoost * stats.topLevelBoost;
+ this.weightValue = stats.weight;
this.avgdl = stats.avgdl;
this.norms = norms;
}
@@ -148,33 +151,34 @@ public class MockBM25Similarity extends
// todo: optimize
@Override
public float score(int doc, float freq) {
- float norm = norms == null ? 0 : k1 * ((1 - b) + b * (decodeNormValue(norms[doc])) / (avgdl));
+ // if there are no norms, we act as if b=0
+ float norm = norms == null ? k1 : k1 * ((1 - b) + b * (decodeNormValue(norms[doc])) / (avgdl));
return weightValue * (freq * (k1 + 1)) / (freq + norm);
}
@Override
public float computeSlopFactor(int distance) {
- return 1.0f / (distance + 1);
+ return sloppyFreq(distance);
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
- return 1;
+ return scorePayload(doc, start, end, payload);
}
}
/** Collection statistics for the BM25 model. */
- public static class BM25Stats extends Stats {
+ private static class BM25Stats extends Stats {
/** BM25's idf */
private final float idf;
/** The average document length. */
private final float avgdl;
/** query's inner boost */
private final float queryBoost;
- /** any outer query's boost */
- private float topLevelBoost;
+ /** weight (idf * queryBoost * topLevelBoost) */
+ private float weight;
- public BM25Stats(float idf, float queryBoost, float avgdl) {
+ BM25Stats(float idf, float queryBoost, float avgdl) {
this.idf = idf;
this.queryBoost = queryBoost;
this.avgdl = avgdl;
@@ -190,7 +194,7 @@ public class MockBM25Similarity extends
@Override
public void normalize(float queryNorm, float topLevelBoost) {
// we don't normalize with queryNorm at all, we just capture the top-level boost
- this.topLevelBoost = topLevelBoost;
+ this.weight = idf * queryBoost * topLevelBoost;
}
}
}