You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/08 22:22:37 UTC

svn commit: r1155078 - in /lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities: DFRSimilarity.java EasySimilarity.java IBSimilarity.java LMDirichletSimilarity.java LMJelinekMercerSimilarity.java LMSimilarity.java

Author: rmuir
Date: Mon Aug  8 20:22:36 2011
New Revision: 1155078

URL: http://svn.apache.org/viewvc?rev=1155078&view=rev
Log:
LUCENE-3220: let EasySim's api take a doclen instead of norm, incorporate discountOverlaps

Modified:
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DFRSimilarity.java Mon Aug  8 20:22:36 2011
@@ -78,21 +78,20 @@ public class DFRSimilarity extends EasyS
   }
   
   @Override
-  protected float score(EasyStats stats, float freq, byte norm) {
-    float tfn = normalization.tfn(stats, freq, decodeNormValue(norm));
+  protected float score(EasyStats stats, float freq, int docLen) {
+    float tfn = normalization.tfn(stats, freq, docLen);
     return stats.getTotalBoost() *
         basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
   }
 
   @Override
   protected void explain(Explanation expl,
-      EasyStats stats, int doc, float freq, byte norm) {
+      EasyStats stats, int doc, float freq, int docLen) {
     if (stats.getTotalBoost() != 1.0f) {
       expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
     }
     
-    int len = decodeNormValue(norm);
-    Explanation normExpl = normalization.explain(stats, freq, len);
+    Explanation normExpl = normalization.explain(stats, freq, docLen);
     float tfn = normExpl.getValue();
     expl.addDetail(normExpl);
     expl.addDetail(basicModel.explain(stats, tfn));

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java Mon Aug  8 20:22:36 2011
@@ -39,6 +39,27 @@ public abstract class EasySimilarity ext
   /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
   private static final double LOG_2 = Math.log(2);
   
+  /** @see #setDiscountOverlaps */
+  protected boolean discountOverlaps = true;
+  
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is true, meaning overlap
+   *  tokens do not count when computing norms.
+   *
+   *  @lucene.experimental
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /** @see #setDiscountOverlaps */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+  
   /**
    * Calls {@link #fillEasyStats(EasyStats, IndexSearcher, String, TermContext...)}.
    * Subclasses that override this method may invoke {@code fillStats} with any
@@ -83,10 +104,10 @@ public abstract class EasySimilarity ext
    * <p>Subclasses must apply their scoring formula in this class.</p>
    * @param stats the corpus level statistics.
    * @param freq the term frequency.
-   * @param norm the current document's field norm.
+   * @param docLen the document length.
    * @return the score.
    */
-  protected abstract float score(EasyStats stats, float freq, byte norm);
+  protected abstract float score(EasyStats stats, float freq, int docLen);
   
   /**
    * Subclasses should implement this method to explain the score. {@code expl}
@@ -99,10 +120,10 @@ public abstract class EasySimilarity ext
    * @param stats the corpus level statistics.
    * @param doc the document id.
    * @param freq the term frequency.
-   * @param norm the current document's field norm.
+   * @param docLen the document length.
    */
   protected void explain(
-      Explanation expl, EasyStats stats, int doc, float freq, byte norm) {}
+      Explanation expl, EasyStats stats, int doc, float freq, int docLen) {}
   
   /**
    * Explains the score. The implementation here provides a basic explanation
@@ -116,18 +137,18 @@ public abstract class EasySimilarity ext
    * @param stats the corpus level statistics.
    * @param doc the document id.
    * @param freq the term frequency and its explanation.
-   * @param norm the current document's field norm.
+   * @param docLen the document length.
    * @return the explanation.
    */
   protected Explanation explain(
-      EasyStats stats, int doc, Explanation freq, byte norm) {
+      EasyStats stats, int doc, Explanation freq, int docLen) {
     Explanation result = new Explanation(); 
-    result.setValue(score(stats, freq.getValue(), norm));
+    result.setValue(score(stats, freq.getValue(), docLen));
     result.setDescription("score(" + getClass().getSimpleName() +
         ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:");
     result.addDetail(freq);
     
-    explain(result, stats, doc, freq.getValue(), norm);
+    explain(result, stats, doc, freq.getValue(), docLen);
     
     return result;
   }
@@ -148,24 +169,24 @@ public abstract class EasySimilarity ext
 
   // ------------------------------ Norm handling ------------------------------
   
-  /** Cache of decoded bytes. */
-  private static final float[] NORM_TABLE = new float[256];
+  /** Norm -> document length map. */
+  private static final int[] NORM_TABLE = new int[256];
 
   static {
-    for (int i = 0; i < 256; i++)
-      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+    for (int i = 0; i < 256; i++) {
+      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
+      NORM_TABLE[i] = (int)(1.0 / (floatNorm * floatNorm));
+    }
   }
 
   /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
   @Override
   public byte computeNorm(FieldInvertState state) {
-    final int numTerms;
-    // nocommit: to include discountOverlaps?
-//    if (discountOverlaps)
-//      numTerms = state.getLength() - state.getNumOverlap();
-//    else
-      numTerms = state.getLength();
-//    return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
+    final float numTerms;
+    if (discountOverlaps)
+      numTerms = state.getLength() - state.getNumOverlap();
+    else
+      numTerms = state.getLength() / state.getBoost();
     return encodeNormValue(numTerms);
   }
   
@@ -173,15 +194,13 @@ public abstract class EasySimilarity ext
    * @see #encodeNormValue(float)
    */
   // nocommit to protected?
-  // nocommit is int OK?
   public int decodeNormValue(byte norm) {
-    float floatNorm = NORM_TABLE[norm & 0xFF];  // & 0xFF maps negative bytes to positive above 127
-    return (int)(1.0 / (floatNorm * floatNorm));  
+    return NORM_TABLE[norm & 0xFF];  // & 0xFF maps negative bytes to positive above 127
   }
   
   /** Encodes the length to a byte via SmallFloat. */
   // nocommit to protected?
-  public byte encodeNormValue(int length) {
+  public byte encodeNormValue(float length) {
     return SmallFloat.floatToByte315((float)(1.0 / Math.sqrt(length)));
   }
   
@@ -212,12 +231,13 @@ public abstract class EasySimilarity ext
     
     @Override
     public float score(int doc, int freq) {
-      return EasySimilarity.this.score(stats, freq, norms[doc]);
+      return EasySimilarity.this.score(stats, freq, decodeNormValue(norms[doc]));
     }
     
     @Override
     public Explanation explain(int doc, Explanation freq) {
-      return EasySimilarity.this.explain(stats, doc, freq, norms[doc]);
+      return EasySimilarity.this.explain(
+          stats, doc, freq, decodeNormValue(norms[doc]));
     }
   }
   
@@ -239,11 +259,12 @@ public abstract class EasySimilarity ext
     // todo: optimize
     @Override
     public float score(int doc, float freq) {
-      return EasySimilarity.this.score(stats, freq, norms[doc]);
+      return EasySimilarity.this.score(stats, freq, decodeNormValue(norms[doc]));
     }
     @Override
     public Explanation explain(int doc, Explanation freq) {
-      return EasySimilarity.this.explain(stats, doc, freq, norms[doc]);
+      return EasySimilarity.this.explain(
+          stats, doc, freq, decodeNormValue(norms[doc]));
     }
 
     @Override

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java Mon Aug  8 20:22:36 2011
@@ -63,22 +63,21 @@ public class IBSimilarity extends EasySi
   }
   
   @Override
-  protected float score(EasyStats stats, float freq, byte norm) {
+  protected float score(EasyStats stats, float freq, int docLen) {
     return stats.getTotalBoost() *
         distribution.score(
             stats,
-            normalization.tfn(stats, freq, decodeNormValue(norm)),
+            normalization.tfn(stats, freq, docLen),
             lambda.lambda(stats));
   }
 
   @Override
   protected void explain(
-      Explanation expl, EasyStats stats, int doc, float freq, byte norm) {
+      Explanation expl, EasyStats stats, int doc, float freq, int docLen) {
     if (stats.getTotalBoost() != 1.0f) {
       expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
     }
-    int len = decodeNormValue(norm);
-    Explanation normExpl = normalization.explain(stats, freq, len);
+    Explanation normExpl = normalization.explain(stats, freq, docLen);
     Explanation lambdaExpl = lambda.explain(stats);
     expl.addDetail(normExpl);
     expl.addDetail(lambdaExpl);

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMDirichletSimilarity.java Mon Aug  8 20:22:36 2011
@@ -54,16 +54,16 @@ public class LMDirichletSimilarity exten
   }
   
   @Override
-  protected float score(EasyStats stats, float freq, byte norm) {
+  protected float score(EasyStats stats, float freq, int docLen) {
     return stats.getTotalBoost() *
         (float)(Math.log(1 + freq /
             (mu * ((LMStats)stats).getCollectionProbability())) +
-        Math.log(mu / (decodeNormValue(norm) + mu)));
+        Math.log(mu / (docLen + mu)));
   }
   
   @Override
   protected void explain(Explanation expl, EasyStats stats, int doc,
-      float freq, byte norm) {
+      float freq, int docLen) {
     if (stats.getTotalBoost() != 1.0f) {
       expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
     }
@@ -75,8 +75,8 @@ public class LMDirichletSimilarity exten
     weightExpl.setDescription("term weight");
     expl.addDetail(weightExpl);
     expl.addDetail(new Explanation(
-        (float)Math.log(mu / (decodeNormValue(norm) + mu)), "document norm"));
-    super.explain(expl, stats, doc, freq, norm);
+        (float)Math.log(mu / (docLen + mu)), "document norm"));
+    super.explain(expl, stats, doc, freq, docLen);
   }
 
   /** Returns the &mu; parameter. */

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java Mon Aug  8 20:22:36 2011
@@ -26,6 +26,9 @@ import org.apache.lucene.search.similari
  * models applied to Ad Hoc information retrieval. In Proceedings of the 24th
  * annual international ACM SIGIR conference on Research and development in
  * information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342.
+ * <p>The model has a single parameter, &lambda;. According to said paper, the
+ * optimal value depends on both the collection and the query. The optimal value
+ * is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
  *
  * @lucene.experimental
  */
@@ -46,20 +49,20 @@ public class LMJelinekMercerSimilarity e
   }
   
   @Override
-  protected float score(EasyStats stats, float freq, byte norm) {
+  protected float score(EasyStats stats, float freq, int docLen) {
     return stats.getTotalBoost() *
         (float)Math.log(1 +
-            ((1 - lambda) * freq / decodeNormValue(norm)) /
+            ((1 - lambda) * freq / docLen) /
             (lambda * ((LMStats)stats).getCollectionProbability()));
   }
   
   @Override
   protected void explain(Explanation expl, EasyStats stats, int doc,
-      float freq, byte norm) {
+      float freq, int docLen) {
     if (stats.getTotalBoost() != 1.0f) {
       expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
     }
-    super.explain(expl, stats, doc, freq, norm);
+    super.explain(expl, stats, doc, freq, docLen);
   }
 
   /** Returns the &lambda; parameter. */

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1155078&r1=1155077&r2=1155078&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java Mon Aug  8 20:22:36 2011
@@ -67,7 +67,7 @@ public abstract class LMSimilarity exten
 
   @Override
   protected void explain(Explanation expl, EasyStats stats, int doc,
-      float freq, byte norm) {
+      float freq, int docLen) {
     expl.addDetail(new Explanation(collectionModel.computeProbability(stats),
                                    "collection probability"));
   }