You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/05 15:14:48 UTC

svn commit: r1154202 - in /lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities: DistributionLL.java DistributionSPL.java EasySimilarity.java EasyStats.java IBSimilarity.java LMSimilarity.java

Author: rmuir
Date: Fri Aug  5 13:14:47 2011
New Revision: 1154202

URL: http://svn.apache.org/viewvc?rev=1154202&view=rev
Log:
LUCENE-3220: use same norm encoding as DefaultSimilarity

Modified:
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
    lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java Fri Aug  5 13:14:47 2011
@@ -24,7 +24,7 @@ package org.apache.lucene.search.similar
  * preference to a specific base.</p>
  * @lucene.experimental
  */
-public abstract class DistributionLL extends Distribution {
+public class DistributionLL extends Distribution {
   @Override
   public final float score(EasyStats stats, float tfn, float lambda) {
     return (float)-Math.log(lambda / (tfn + lambda));

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java Fri Aug  5 13:14:47 2011
@@ -25,7 +25,7 @@ package org.apache.lucene.search.similar
  * preference to a specific base.</p>
  * @lucene.experimental
  */
-public abstract class DistributionSPL extends Distribution {
+public class DistributionSPL extends Distribution {
   @Override
   public final float score(EasyStats stats, float tfn, float lambda) {
     return (float)-Math.log(

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java Fri Aug  5 13:14:47 2011
@@ -57,10 +57,9 @@ public abstract class EasySimilarity ext
       String fieldName, TermContext... termContexts) throws IOException {
     IndexReader reader = searcher.getIndexReader();
     int numberOfDocuments = reader.maxDoc();
-    long sumTotalTermFreq = MultiFields.getTerms(searcher.getIndexReader(),
+    long numberOfFieldTokens = MultiFields.getTerms(searcher.getIndexReader(),
         fieldName).getSumTotalTermFreq();
-    long numberOfFieldTokens = sumTotalTermFreq; // nocommit: these are the same stat?
-    float avgFieldLength = (float)sumTotalTermFreq / numberOfDocuments;
+    float avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
     
     // nocommit This is for phrases, and it doesn't really work... have to
     // find a method that makes sense
@@ -76,33 +75,9 @@ public abstract class EasySimilarity ext
     stats.setAvgFieldLength(avgFieldLength);
     stats.setDocFreq(docFreq);
     stats.setTotalTermFreq(totalTermFreq);
-    stats.setSumTotalTermFreq(sumTotalTermFreq);
     // nocommit uniqueTermCount? (LUCENE-3290)
   }
   
-  /** Encodes the document length. */
-  @Override
-  public byte computeNorm(FieldInvertState state) {
-    return encodeNormValue(state.getLength());
-  }
-  
-  /** Decodes a normalization factor stored in an index.
-   * @see #encodeNormValue(float)
-   */
-  // nocommit to protected?
-  // nocommit is int OK?
-  public int decodeNormValue(byte norm) {
-    // SmallFloat seems OK, because tf is smoothed anyway.
-    return (int)SmallFloat.byte315ToFloat(norm);
-  }
-  
-  /** Encodes the length to a byte via SmallInt. */
-  // nocommit to protected?
-  public byte encodeNormValue(int length) {
-    // SmallFloat seems OK, because tf is smoothed anyway.
-    return SmallFloat.floatToByte315(length);
-  }
-  
   /**
    * Scores the document {@code doc}.
    * <p>Subclasses must apply their scoring formula in this class.</p>
@@ -170,6 +145,45 @@ public abstract class EasySimilarity ext
     return new EasySloppyDocScorer((EasyStats) stats,
                                    context.reader.norms(fieldName));
   }
+
+  // ------------------------------ Norm handling ------------------------------
+  
+  /** Cache of decoded bytes. */
+  private static final float[] NORM_TABLE = new float[256];
+
+  static {
+    for (int i = 0; i < 256; i++)
+      NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+  }
+
+  /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
+  @Override
+  public byte computeNorm(FieldInvertState state) {
+    final int numTerms;
+    // nocommit: to include discountOverlaps?
+//    if (discountOverlaps)
+//      numTerms = state.getLength() - state.getNumOverlap();
+//    else
+      numTerms = state.getLength();
+//    return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
+    return encodeNormValue(numTerms);
+  }
+  
+  /** Decodes a normalization factor (document length) stored in an index.
+   * @see #encodeNormValue(int)
+   */
+  // nocommit to protected?
+  // nocommit is int OK?
+  public int decodeNormValue(byte norm) {
+    float floatNorm = NORM_TABLE[norm & 0xFF];  // & 0xFF maps negative bytes to positive above 127
+    return (int)(1.0 / (floatNorm * floatNorm));  
+  }
+  
+  /** Encodes the length to a byte via SmallFloat. */
+  // nocommit to protected?
+  public byte encodeNormValue(int length) {
+    return SmallFloat.floatToByte315((float)(1.0 / Math.sqrt(length)));
+  }
   
   // ----------------------------- Static methods ------------------------------
   

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java Fri Aug  5 13:14:47 2011
@@ -17,6 +17,8 @@ package org.apache.lucene.search.similar
  * limitations under the License.
  */
 
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.Similarity;
 
 /**
@@ -33,11 +35,7 @@ public class EasyStats extends Similarit
   /** The document frequency. */
   protected int docFreq;
   /** The total number of occurrences of this term across all documents. */
-  // TODO: same field?
   protected long totalTermFreq;
-  /** The total number of terms across all documents. */
-  // TODO: same field?
-  protected long sumTotalTermFreq;
   /** The number of unique terms. */
   // nocommit might be per-segment only
   protected long uniqueTermCount;
@@ -70,12 +68,18 @@ public class EasyStats extends Similarit
     this.numberOfDocuments = numberOfDocuments;
   }
   
-  /** Returns the total number of tokens in the field. */
+  /**
+   * Returns the total number of tokens in the field.
+   * @see Terms#getSumTotalTermFreq()
+   */
   public long getNumberOfFieldTokens() {
     return numberOfFieldTokens;
   }
   
-  /** Sets the total number of tokens in the field. */
+  /**
+   * Sets the total number of tokens in the field.
+   * @see Terms#getSumTotalTermFreq()
+   */
   public void setNumberOfFieldTokens(long numberOfFieldTokens) {
     this.numberOfFieldTokens = numberOfFieldTokens;
   }
@@ -110,16 +114,6 @@ public class EasyStats extends Similarit
     this.totalTermFreq = totalTermFreq;
   }
   
-  /** Returns the total number of terms across all documents. */
-  public long getSumTotalTermFreq() {
-    return sumTotalTermFreq;
-  }
-  
-  /** Sets the total number of terms across all documents. */
-  public void setSumTotalTermFreq(long sumTotalTermFreq) {
-    this.sumTotalTermFreq = sumTotalTermFreq;
-  }
-  
   /** Returns the number of unique terms. */
   public long getUniqueTermCount() {
     return uniqueTermCount;

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java Fri Aug  5 13:14:47 2011
@@ -49,15 +49,17 @@ public class IBSimilarity extends EasySi
   /** The term frequency normalization. */
   protected final Normalization normalization;
   
-  public IBSimilarity(Class<Distribution> distributionClass,
-                      Class<Lambda> lambdaClass,
-                      Class<Normalization> normalizationClass)
-  throws InstantiationException, IllegalAccessException {
-    distribution = distributionClass.newInstance();
-    lambda = lambdaClass.newInstance();
-    normalization = (normalizationClass != null)
-                  ? normalizationClass.newInstance()
-                  : new Normalization.NoNormalization();
+  public IBSimilarity(Distribution distribution,
+                      Lambda lambda,
+                      Normalization normalization) {
+    this.distribution = distribution;
+    this.lambda = lambda;
+    this.normalization = normalization;
+  }
+  
+  /** Creates an instance with no normalization. */
+  public IBSimilarity(Distribution distribution, Lambda lambda) {
+    this(distribution, lambda, new Normalization.NoNormalization());
   }
   
   @Override

Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java Fri Aug  5 13:14:47 2011
@@ -114,7 +114,7 @@ public abstract class LMSimilarity exten
   public static class DefaultCollectionModel implements CollectionModel {
     @Override
     public float computeProbability(EasyStats stats) {
-      return (float)stats.getTotalTermFreq() / stats.getSumTotalTermFreq();
+      return (float)stats.getTotalTermFreq() / stats.getNumberOfFieldTokens();
     }
   }
 }