You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/30 19:12:21 UTC

svn commit: r1163294 - in /lucene/dev/branches/flexscoring/lucene/src: java/org/apache/lucene/search/similarities/ test-framework/org/apache/lucene/search/ test/org/apache/lucene/search/ test/org/apache/lucene/search/similarities/

Author: rmuir
Date: Tue Aug 30 17:12:20 2011
New Revision: 1163294

URL: http://svn.apache.org/viewvc?rev=1163294&view=rev
Log:
enable information-based models in tests

Modified:
    lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
    lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
    lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
    lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java

Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java Tue Aug 30 17:12:20 2011
@@ -20,7 +20,7 @@ package org.apache.lucene.search.similar
 import org.apache.lucene.search.Explanation;
 
 /**
- * Model of the information gain based on the ration of two Bernoulli processes.
+ * Model of the information gain based on the ratio of two Bernoulli processes.
  * @lucene.experimental
  */
 public class AfterEffectB extends AfterEffect {

Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java Tue Aug 30 17:12:20 2011
@@ -45,4 +45,9 @@ public class BasicSimilarityProvider imp
   public Similarity get(String field) {
     return sim;
   }
+
+  @Override
+  public String toString() {
+    return "BasicSimilarityProvider(" + sim + ")";
+  }
 }
\ No newline at end of file

Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java Tue Aug 30 17:12:20 2011
@@ -33,7 +33,7 @@ import org.apache.lucene.search.Explanat
  * their counterparts in the Terrier IR engine.</p>
  * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
  * is not handled by this implementation.</p>
- * 
+ * <p>Note: DFR models currently do not support index-time boosting.</p>
  * @see BasicModel
  * @see AfterEffect
  * @see Normalization
@@ -58,31 +58,17 @@ public class DFRSimilarity extends Simil
     this.normalization = normalization;
   }
 
-  /** Creates a DFR model with no normalization. */
-  public DFRSimilarity(BasicModel basicModel,
-                       AfterEffect afterEffect) {
-    this(basicModel, afterEffect, new Normalization.NoNormalization());
-  }
-  
-  /** Creates a DFR model with no aftereffect. */
-  public DFRSimilarity(BasicModel basicModel,
-                       Normalization normalization) {
-    this(basicModel, new AfterEffect.NoAfterEffect(), normalization);
-  }
-  
-  /** Creates a DFR model with only a basic model. */
-  public DFRSimilarity(BasicModel basicModel) {
-    this(basicModel,
-         new AfterEffect.NoAfterEffect(),
-         new Normalization.NoNormalization());
-  }
-  
   @Override
   protected float score(BasicStats stats, float freq, float docLen) {
     float tfn = normalization.tfn(stats, freq, docLen);
     return stats.getTotalBoost() *
         basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
   }
+  
+  @Override
+  protected boolean supportsIndexTimeBoost() {
+    return false;
+  }
 
   @Override
   protected void explain(Explanation expl,

Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java Tue Aug 30 17:12:20 2011
@@ -57,11 +57,6 @@ public class IBSimilarity extends Simila
     this.normalization = normalization;
   }
   
-  /** Creates an instance with no normalization. */
-  public IBSimilarity(Distribution distribution, Lambda lambda) {
-    this(distribution, lambda, new Normalization.NoNormalization());
-  }
-  
   @Override
   protected float score(BasicStats stats, float freq, float docLen) {
     return stats.getTotalBoost() *

Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java Tue Aug 30 17:12:20 2011
@@ -66,6 +66,20 @@ public abstract class SimilarityBase ext
   }
   
   /**
+   * True if this implementation supports index-time boosting.
+   * <p> 
+   * Note: although subclasses use the same length normalization encoding as 
+   * Lucene's DefaultSimilarity, index-time boosting does not always work well: some
+   * implementing models may have more sophisticated normalizations (e.g. Bernoulli
+   * aftereffect) that cannot be 'outsmarted' by making a document's length appear shorter.
+   * In these cases boosting a document higher may actually have the reverse effect,
+   * so subclasses can return false here so that the user will get an error instead.
+   */
+  protected boolean supportsIndexTimeBoost() {
+    return true;
+  }
+  
+  /**
    * Calls {@link #fillBasicStats(BasicStats, IndexSearcher, String, TermContext...)}.
    * Subclasses that override this method may invoke {@code fillStats} with any
    * subclass of {@code BasicStats}.
@@ -223,6 +237,9 @@ public abstract class SimilarityBase ext
       numTerms = state.getLength() - state.getNumOverlap();
     else
       numTerms = state.getLength() / state.getBoost();
+    if (!supportsIndexTimeBoost() && state.getBoost() != 1F) {
+      throw new UnsupportedOperationException("index-time boosting is not supported");
+    }
     return encodeNormValue(state.getBoost(), numTerms);
   }
   
@@ -267,13 +284,13 @@ public abstract class SimilarityBase ext
     public float score(int doc, int freq) {
       // We have to supply something in case norms are omitted
       return SimilarityBase.this.score(stats, freq,
-          norms == null ? freq : decodeNormValue(norms[doc]));
+          norms == null ? 1F : decodeNormValue(norms[doc]));
     }
     
     @Override
     public Explanation explain(int doc, Explanation freq) {
       return SimilarityBase.this.explain(stats, doc, freq,
-          norms == null ? freq.getValue() : decodeNormValue(norms[doc]));
+          norms == null ? 1F : decodeNormValue(norms[doc]));
     }
   }
   
@@ -296,12 +313,12 @@ public abstract class SimilarityBase ext
     public float score(int doc, float freq) {
       // We have to supply something in case norms are omitted
       return SimilarityBase.this.score(stats, freq,
-          norms == null ? freq : decodeNormValue(norms[doc]));
+          norms == null ? 1F : decodeNormValue(norms[doc]));
     }
     @Override
     public Explanation explain(int doc, Explanation freq) {
       return SimilarityBase.this.explain(stats, doc, freq,
-          norms == null ? freq.getValue() : decodeNormValue(norms[doc]));
+          norms == null ? 1F : decodeNormValue(norms[doc]));
     }
 
     @Override

Modified: lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java Tue Aug 30 17:12:20 2011
@@ -111,7 +111,9 @@ public class RandomSimilarityProvider ex
   /** The DFR normalizations to test. */
   static Normalization[] NORMALIZATIONS = {
     new NormalizationH1(), new NormalizationH2(),
-    new Normalization.NoNormalization()
+    // TODO: if we enable NoNormalization, we have to deal with
+    // a couple of tests (e.g. TestDocBoost, TestSort) that expect length normalization
+    // new Normalization.NoNormalization()
   };
   /** The distributions for IB. */
   static Distribution[] DISTRIBUTIONS = {
@@ -126,13 +128,14 @@ public class RandomSimilarityProvider ex
     allSims = new ArrayList<Similarity>();
     allSims.add(new DefaultSimilarity());
     allSims.add(new BM25Similarity());
-    /* TODO: enable all sims: for (BasicModel basicModel : BASIC_MODELS) {
+    /* TODO: enable DFR sims
+    for (BasicModel basicModel : BASIC_MODELS) {
       for (AfterEffect afterEffect : AFTER_EFFECTS) {
         for (Normalization normalization : NORMALIZATIONS) {
           allSims.add(new DFRSimilarity(basicModel, afterEffect, normalization));
         }
       }
-    }
+    } */
     for (Distribution distribution : DISTRIBUTIONS) {
       for (Lambda lambda : LAMBDAS) {
         for (Normalization normalization : NORMALIZATIONS) {
@@ -140,9 +143,11 @@ public class RandomSimilarityProvider ex
         }
       }
     }
+    /* TODO: enable LM sims
     allSims.add(new LMDirichletSimilarity());
     allSims.add(new LMJelinekMercerSimilarity(0.1f));
-    allSims.add(new LMJelinekMercerSimilarity(0.7f)); */
+    allSims.add(new LMJelinekMercerSimilarity(0.7f));
+    */
   }
   
   @Override

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java Tue Aug 30 17:12:20 2011
@@ -56,7 +56,8 @@ public class TestDocBoost extends Lucene
 
     final float[] scores = new float[4];
 
-    newSearcher(reader).search
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.search
       (new TermQuery(new Term("field", "word")),
        new Collector() {
          private int base = 0;
@@ -82,7 +83,10 @@ public class TestDocBoost extends Lucene
     float lastScore = 0.0f;
 
     for (int i = 0; i < 2; i++) {
-      assertTrue(scores[i] > lastScore);
+      if (VERBOSE) {
+        System.out.println(searcher.explain(new TermQuery(new Term("field", "word")), i));
+      }
+      assertTrue("score: " + scores[i] + " should be > lastScore: " + lastScore, scores[i] > lastScore);
       lastScore = scores[i];
     }
     

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java Tue Aug 30 17:12:20 2011
@@ -409,7 +409,7 @@ public class TestSimilarityBase extends 
    * no normalization.
    */
   public void testLLForIB() throws IOException {
-    SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF());
+    SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization());
     correctnessTestCore(sim, 4.26267987704f);
   }
   
@@ -419,7 +419,7 @@ public class TestSimilarityBase extends 
    */
   public void testSPLForIB() throws IOException {
     SimilarityBase sim =
-      new IBSimilarity(new DistributionSPL(), new LambdaTTF());
+      new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization());
     correctnessTestCore(sim, 2.24069910825f);
   }
   
@@ -475,7 +475,7 @@ public class TestSimilarityBase extends 
 
   /** Correctness test for the D DFR model (basic model only). */
   public void testD() throws IOException {
-    SimilarityBase sim = new DFRSimilarity(new BasicModelD());
+    SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
     double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1);                // 0.009900990099
     double phi = FREQ / TOTAL_TERM_FREQ;                       // 0.1
     double D = phi * SimilarityBase.log2(phi / p) +            // 0.209745318365
@@ -488,7 +488,7 @@ public class TestSimilarityBase extends 
   /** Correctness test for the In2 DFR model with no aftereffect. */
   public void testIn2() throws IOException {
     SimilarityBase sim = new DFRSimilarity(
-        new BasicModelIn(), new NormalizationH2());
+        new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
     float tfn = (float)(FREQ * SimilarityBase.log2(            // 8.1894750101
                 1 + AVG_FIELD_LENGTH / DOC_LEN));
     float gold = (float)(tfn * SimilarityBase.log2(            // 26.7459577898
@@ -499,7 +499,7 @@ public class TestSimilarityBase extends 
   /** Correctness test for the IFB DFR model with no normalization. */
   public void testIFB() throws IOException {
     SimilarityBase sim = new DFRSimilarity(
-        new BasicModelIF(), new AfterEffectB());
+        new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization());
     float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875
     float IF = (float)(FREQ * SimilarityBase.log2(             // 8.97759389642
                1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5)));