You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/05 15:14:48 UTC
svn commit: r1154202 - in /lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities: DistributionLL.java DistributionSPL.java EasySimilarity.java EasyStats.java IBSimilarity.java LMSimilarity.java
Author: rmuir
Date: Fri Aug 5 13:14:47 2011
New Revision: 1154202
URL: http://svn.apache.org/viewvc?rev=1154202&view=rev
Log:
LUCENE-3220: use same norm encoding as DefaultSimilarity
Modified:
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionLL.java Fri Aug 5 13:14:47 2011
@@ -24,7 +24,7 @@ package org.apache.lucene.search.similar
* preference to a specific base.</p>
* @lucene.experimental
*/
-public abstract class DistributionLL extends Distribution {
+public class DistributionLL extends Distribution {
@Override
public final float score(EasyStats stats, float tfn, float lambda) {
return (float)-Math.log(lambda / (tfn + lambda));
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/DistributionSPL.java Fri Aug 5 13:14:47 2011
@@ -25,7 +25,7 @@ package org.apache.lucene.search.similar
* preference to a specific base.</p>
* @lucene.experimental
*/
-public abstract class DistributionSPL extends Distribution {
+public class DistributionSPL extends Distribution {
@Override
public final float score(EasyStats stats, float tfn, float lambda) {
return (float)-Math.log(
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasySimilarity.java Fri Aug 5 13:14:47 2011
@@ -57,10 +57,9 @@ public abstract class EasySimilarity ext
String fieldName, TermContext... termContexts) throws IOException {
IndexReader reader = searcher.getIndexReader();
int numberOfDocuments = reader.maxDoc();
- long sumTotalTermFreq = MultiFields.getTerms(searcher.getIndexReader(),
+ long numberOfFieldTokens = MultiFields.getTerms(searcher.getIndexReader(),
fieldName).getSumTotalTermFreq();
- long numberOfFieldTokens = sumTotalTermFreq; // nocommit: these are the same stat?
- float avgFieldLength = (float)sumTotalTermFreq / numberOfDocuments;
+ float avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
// nocommit This is for phrases, and it doesn't really work... have to
// find a method that makes sense
@@ -76,33 +75,9 @@ public abstract class EasySimilarity ext
stats.setAvgFieldLength(avgFieldLength);
stats.setDocFreq(docFreq);
stats.setTotalTermFreq(totalTermFreq);
- stats.setSumTotalTermFreq(sumTotalTermFreq);
// nocommit uniqueTermCount? (LUCENE-3290)
}
- /** Encodes the document length. */
- @Override
- public byte computeNorm(FieldInvertState state) {
- return encodeNormValue(state.getLength());
- }
-
- /** Decodes a normalization factor stored in an index.
- * @see #encodeNormValue(float)
- */
- // nocommit to protected?
- // nocommit is int OK?
- public int decodeNormValue(byte norm) {
- // SmallFloat seems OK, because tf is smoothed anyway.
- return (int)SmallFloat.byte315ToFloat(norm);
- }
-
- /** Encodes the length to a byte via SmallInt. */
- // nocommit to protected?
- public byte encodeNormValue(int length) {
- // SmallFloat seems OK, because tf is smoothed anyway.
- return SmallFloat.floatToByte315(length);
- }
-
/**
* Scores the document {@code doc}.
* <p>Subclasses must apply their scoring formula in this class.</p>
@@ -170,6 +145,45 @@ public abstract class EasySimilarity ext
return new EasySloppyDocScorer((EasyStats) stats,
context.reader.norms(fieldName));
}
+
+ // ------------------------------ Norm handling ------------------------------
+
+ /** Cache of decoded bytes. */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++)
+ NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ }
+
+ /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
+ @Override
+ public byte computeNorm(FieldInvertState state) {
+ final int numTerms;
+ // nocommit: to include discountOverlaps?
+// if (discountOverlaps)
+// numTerms = state.getLength() - state.getNumOverlap();
+// else
+ numTerms = state.getLength();
+// return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
+ return encodeNormValue(numTerms);
+ }
+
+ /** Decodes a normalization factor (document length) stored in an index.
+ * @see #encodeNormValue(float)
+ */
+ // nocommit to protected?
+ // nocommit is int OK?
+ public int decodeNormValue(byte norm) {
+ float floatNorm = NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+ return (int)(1.0 / (floatNorm * floatNorm));
+ }
+
+ /** Encodes the length to a byte via SmallFloat. */
+ // nocommit to protected?
+ public byte encodeNormValue(int length) {
+ return SmallFloat.floatToByte315((float)(1.0 / Math.sqrt(length)));
+ }
// ----------------------------- Static methods ------------------------------
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/EasyStats.java Fri Aug 5 13:14:47 2011
@@ -17,6 +17,8 @@ package org.apache.lucene.search.similar
* limitations under the License.
*/
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Similarity;
/**
@@ -33,11 +35,7 @@ public class EasyStats extends Similarit
/** The document frequency. */
protected int docFreq;
/** The total number of occurrences of this term across all documents. */
- // TODO: same field?
protected long totalTermFreq;
- /** The total number of terms across all documents. */
- // TODO: same field?
- protected long sumTotalTermFreq;
/** The number of unique terms. */
// nocommit might be per-segment only
protected long uniqueTermCount;
@@ -70,12 +68,18 @@ public class EasyStats extends Similarit
this.numberOfDocuments = numberOfDocuments;
}
- /** Returns the total number of tokens in the field. */
+ /**
+ * Returns the total number of tokens in the field.
+ * @see Terms#getSumTotalTermFreq()
+ */
public long getNumberOfFieldTokens() {
return numberOfFieldTokens;
}
- /** Sets the total number of tokens in the field. */
+ /**
+ * Sets the total number of tokens in the field.
+ * @see Terms#getSumTotalTermFreq()
+ */
public void setNumberOfFieldTokens(long numberOfFieldTokens) {
this.numberOfFieldTokens = numberOfFieldTokens;
}
@@ -110,16 +114,6 @@ public class EasyStats extends Similarit
this.totalTermFreq = totalTermFreq;
}
- /** Returns the total number of terms across all documents. */
- public long getSumTotalTermFreq() {
- return sumTotalTermFreq;
- }
-
- /** Sets the total number of terms across all documents. */
- public void setSumTotalTermFreq(long sumTotalTermFreq) {
- this.sumTotalTermFreq = sumTotalTermFreq;
- }
-
/** Returns the number of unique terms. */
public long getUniqueTermCount() {
return uniqueTermCount;
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/IBSimilarity.java Fri Aug 5 13:14:47 2011
@@ -49,15 +49,17 @@ public class IBSimilarity extends EasySi
/** The term frequency normalization. */
protected final Normalization normalization;
- public IBSimilarity(Class<Distribution> distributionClass,
- Class<Lambda> lambdaClass,
- Class<Normalization> normalizationClass)
- throws InstantiationException, IllegalAccessException {
- distribution = distributionClass.newInstance();
- lambda = lambdaClass.newInstance();
- normalization = (normalizationClass != null)
- ? normalizationClass.newInstance()
- : new Normalization.NoNormalization();
+ public IBSimilarity(Distribution distribution,
+ Lambda lambda,
+ Normalization normalization) {
+ this.distribution = distribution;
+ this.lambda = lambda;
+ this.normalization = normalization;
+ }
+
+ /** Creates an instance with no normalization. */
+ public IBSimilarity(Distribution distribution, Lambda lambda) {
+ this(distribution, lambda, new Normalization.NoNormalization());
}
@Override
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java?rev=1154202&r1=1154201&r2=1154202&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/LMSimilarity.java Fri Aug 5 13:14:47 2011
@@ -114,7 +114,7 @@ public abstract class LMSimilarity exten
public static class DefaultCollectionModel implements CollectionModel {
@Override
public float computeProbability(EasyStats stats) {
- return (float)stats.getTotalTermFreq() / stats.getSumTotalTermFreq();
+ return (float)stats.getTotalTermFreq() / stats.getNumberOfFieldTokens();
}
}
}