You are viewing a plain-text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/08 22:22:37 UTC
svn commit: r1155078 - in
/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities:
DFRSimilarity.java EasySimilarity.java IBSimilarity.java
LMDirichletSimilarity.java LMJelinekMercerSimilarity.java LMSimilarity.java
Author: rmuir
Date: Mon Aug 8 20:22:36 2011
New Revision: 1155078
URL: http://svn.apache.org/viewvc?rev=1155078&view=rev
Log:
LUCENE-3220: let EasySim's api take a doclen instead of norm, incorporate discountOverlaps
Modified:
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java Mon Aug 8 20:22:36 2011
@@ -78,21 +78,20 @@ public class DFRSimilarity extends EasyS
}
@Override
- protected float score(EasyStats stats, float freq, byte norm) {
- float tfn = normalization.tfn(stats, freq, decodeNormValue(norm));
+ protected float score(EasyStats stats, float freq, int docLen) {
+ float tfn = normalization.tfn(stats, freq, docLen);
return stats.getTotalBoost() *
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
}
@Override
protected void explain(Explanation expl,
- EasyStats stats, int doc, float freq, byte norm) {
+ EasyStats stats, int doc, float freq, int docLen) {
if (stats.getTotalBoost() != 1.0f) {
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
}
- int len = decodeNormValue(norm);
- Explanation normExpl = normalization.explain(stats, freq, len);
+ Explanation normExpl = normalization.explain(stats, freq, docLen);
float tfn = normExpl.getValue();
expl.addDetail(normExpl);
expl.addDetail(basicModel.explain(stats, tfn));
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java Mon Aug 8 20:22:36 2011
@@ -39,6 +39,27 @@ public abstract class EasySimilarity ext
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
private static final double LOG_2 = Math.log(2);
+ /** @see #setDiscountOverlaps */
+ protected boolean discountOverlaps = true;
+
+ /** Determines whether overlap tokens (Tokens with
+ * 0 position increment) are ignored when computing
+ * norm. By default this is true, meaning overlap
+ * tokens do not count when computing norms.
+ *
+ * @lucene.experimental
+ *
+ * @see #computeNorm
+ */
+ public void setDiscountOverlaps(boolean v) {
+ discountOverlaps = v;
+ }
+
+ /** @see #setDiscountOverlaps */
+ public boolean getDiscountOverlaps() {
+ return discountOverlaps;
+ }
+
/**
* Calls {@link #fillEasyStats(EasyStats, IndexSearcher, String, TermContext...)}.
* Subclasses that override this method may invoke {@code fillStats} with any
@@ -83,10 +104,10 @@ public abstract class EasySimilarity ext
* <p>Subclasses must apply their scoring formula in this class.</p>
* @param stats the corpus level statistics.
* @param freq the term frequency.
- * @param norm the current document's field norm.
+ * @param docLen the document length.
* @return the score.
*/
- protected abstract float score(EasyStats stats, float freq, byte norm);
+ protected abstract float score(EasyStats stats, float freq, int docLen);
/**
* Subclasses should implement this method to explain the score. {@code expl}
@@ -99,10 +120,10 @@ public abstract class EasySimilarity ext
* @param stats the corpus level statistics.
* @param doc the document id.
* @param freq the term frequency.
- * @param norm the current document's field norm.
+ * @param docLen the document length.
*/
protected void explain(
- Explanation expl, EasyStats stats, int doc, float freq, byte norm) {}
+ Explanation expl, EasyStats stats, int doc, float freq, int docLen) {}
/**
* Explains the score. The implementation here provides a basic explanation
@@ -116,18 +137,18 @@ public abstract class EasySimilarity ext
* @param stats the corpus level statistics.
* @param doc the document id.
* @param freq the term frequency and its explanation.
- * @param norm the current document's field norm.
+ * @param docLen the document length.
* @return the explanation.
*/
protected Explanation explain(
- EasyStats stats, int doc, Explanation freq, byte norm) {
+ EasyStats stats, int doc, Explanation freq, int docLen) {
Explanation result = new Explanation();
- result.setValue(score(stats, freq.getValue(), norm));
+ result.setValue(score(stats, freq.getValue(), docLen));
result.setDescription("score(" + getClass().getSimpleName() +
", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:");
result.addDetail(freq);
- explain(result, stats, doc, freq.getValue(), norm);
+ explain(result, stats, doc, freq.getValue(), docLen);
return result;
}
@@ -148,24 +169,24 @@ public abstract class EasySimilarity ext
// ------------------------------ Norm handling ------------------------------
- /** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
+ /** Norm -> document length map. */
+ private static final int[] NORM_TABLE = new int[256];
static {
- for (int i = 0; i < 256; i++)
- NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ for (int i = 0; i < 256; i++) {
+ float floatNorm = SmallFloat.byte315ToFloat((byte)i);
+ NORM_TABLE[i] = (int)(1.0 / (floatNorm * floatNorm));
+ }
}
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
@Override
public byte computeNorm(FieldInvertState state) {
- final int numTerms;
- // nocommit: to include discountOverlaps?
-// if (discountOverlaps)
-// numTerms = state.getLength() - state.getNumOverlap();
-// else
- numTerms = state.getLength();
-// return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
+ final float numTerms;
+ if (discountOverlaps)
+ numTerms = state.getLength() - state.getNumOverlap();
+ else
+ numTerms = state.getLength() / state.getBoost();
return encodeNormValue(numTerms);
}
@@ -173,15 +194,13 @@ public abstract class EasySimilarity ext
* @see #encodeNormValue(float)
*/
// nocommit to protected?
- // nocommit is int OK?
public int decodeNormValue(byte norm) {
- float floatNorm = NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
- return (int)(1.0 / (floatNorm * floatNorm));
+ return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes the length to a byte via SmallFloat. */
// nocommit to protected?
- public byte encodeNormValue(int length) {
+ public byte encodeNormValue(float length) {
return SmallFloat.floatToByte315((float)(1.0 / Math.sqrt(length)));
}
@@ -212,12 +231,13 @@ public abstract class EasySimilarity ext
@Override
public float score(int doc, int freq) {
- return EasySimilarity.this.score(stats, freq, norms[doc]);
+ return EasySimilarity.this.score(stats, freq, decodeNormValue(norms[doc]));
}
@Override
public Explanation explain(int doc, Explanation freq) {
- return EasySimilarity.this.explain(stats, doc, freq, norms[doc]);
+ return EasySimilarity.this.explain(
+ stats, doc, freq, decodeNormValue(norms[doc]));
}
}
@@ -239,11 +259,12 @@ public abstract class EasySimilarity ext
// todo: optimize
@Override
public float score(int doc, float freq) {
- return EasySimilarity.this.score(stats, freq, norms[doc]);
+ return EasySimilarity.this.score(stats, freq, decodeNormValue(norms[doc]));
}
@Override
public Explanation explain(int doc, Explanation freq) {
- return EasySimilarity.this.explain(stats, doc, freq, norms[doc]);
+ return EasySimilarity.this.explain(
+ stats, doc, freq, decodeNormValue(norms[doc]));
}
@Override
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java Mon Aug 8 20:22:36 2011
@@ -63,22 +63,21 @@ public class IBSimilarity extends EasySi
}
@Override
- protected float score(EasyStats stats, float freq, byte norm) {
+ protected float score(EasyStats stats, float freq, int docLen) {
return stats.getTotalBoost() *
distribution.score(
stats,
- normalization.tfn(stats, freq, decodeNormValue(norm)),
+ normalization.tfn(stats, freq, docLen),
lambda.lambda(stats));
}
@Override
protected void explain(
- Explanation expl, EasyStats stats, int doc, float freq, byte norm) {
+ Explanation expl, EasyStats stats, int doc, float freq, int docLen) {
if (stats.getTotalBoost() != 1.0f) {
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
}
- int len = decodeNormValue(norm);
- Explanation normExpl = normalization.explain(stats, freq, len);
+ Explanation normExpl = normalization.explain(stats, freq, docLen);
Explanation lambdaExpl = lambda.explain(stats);
expl.addDetail(normExpl);
expl.addDetail(lambdaExpl);
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java Mon Aug 8 20:22:36 2011
@@ -54,16 +54,16 @@ public class LMDirichletSimilarity exten
}
@Override
- protected float score(EasyStats stats, float freq, byte norm) {
+ protected float score(EasyStats stats, float freq, int docLen) {
return stats.getTotalBoost() *
(float)(Math.log(1 + freq /
(mu * ((LMStats)stats).getCollectionProbability())) +
- Math.log(mu / (decodeNormValue(norm) + mu)));
+ Math.log(mu / (docLen + mu)));
}
@Override
protected void explain(Explanation expl, EasyStats stats, int doc,
- float freq, byte norm) {
+ float freq, int docLen) {
if (stats.getTotalBoost() != 1.0f) {
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
}
@@ -75,8 +75,8 @@ public class LMDirichletSimilarity exten
weightExpl.setDescription("term weight");
expl.addDetail(weightExpl);
expl.addDetail(new Explanation(
- (float)Math.log(mu / (decodeNormValue(norm) + mu)), "document norm"));
- super.explain(expl, stats, doc, freq, norm);
+ (float)Math.log(mu / (docLen + mu)), "document norm"));
+ super.explain(expl, stats, doc, freq, docLen);
}
/** Returns the μ parameter. */
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java Mon Aug 8 20:22:36 2011
@@ -26,6 +26,9 @@ import org.apache.lucene.search.similari
* models applied to Ad Hoc information retrieval. In Proceedings of the 24th
* annual international ACM SIGIR conference on Research and development in
* information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342.
+ * <p>The model has a single parameter, λ. According to said paper, the
+ * optimal value depends on both the collection and the query. The optimal value
+ * is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
*
* @lucene.experimental
*/
@@ -46,20 +49,20 @@ public class LMJelinekMercerSimilarity e
}
@Override
- protected float score(EasyStats stats, float freq, byte norm) {
+ protected float score(EasyStats stats, float freq, int docLen) {
return stats.getTotalBoost() *
(float)Math.log(1 +
- ((1 - lambda) * freq / decodeNormValue(norm)) /
+ ((1 - lambda) * freq / docLen) /
(lambda * ((LMStats)stats).getCollectionProbability()));
}
@Override
protected void explain(Explanation expl, EasyStats stats, int doc,
- float freq, byte norm) {
+ float freq, int docLen) {
if (stats.getTotalBoost() != 1.0f) {
expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
}
- super.explain(expl, stats, doc, freq, norm);
+ super.explain(expl, stats, doc, freq, docLen);
}
/** Returns the λ parameter. */
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java Mon Aug 8 20:22:36 2011
@@ -67,7 +67,7 @@ public abstract class LMSimilarity exten
@Override
protected void explain(Explanation expl, EasyStats stats, int doc,
- float freq, byte norm) {
+ float freq, int docLen) {
expl.addDetail(new Explanation(collectionModel.computeProbability(stats),
"collection probability"));
}