You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/08/30 19:12:21 UTC
svn commit: r1163294 - in /lucene/dev/branches/flexscoring/lucene/src:
java/org/apache/lucene/search/similarities/
test-framework/org/apache/lucene/search/ test/org/apache/lucene/search/
test/org/apache/lucene/search/similarities/
Author: rmuir
Date: Tue Aug 30 17:12:20 2011
New Revision: 1163294
URL: http://svn.apache.org/viewvc?rev=1163294&view=rev
Log:
enable information-based models in tests
Modified:
lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java
lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java Tue Aug 30 17:12:20 2011
@@ -20,7 +20,7 @@ package org.apache.lucene.search.similar
import org.apache.lucene.search.Explanation;
/**
- * Model of the information gain based on the ration of two Bernoulli processes.
+ * Model of the information gain based on the ratio of two Bernoulli processes.
* @lucene.experimental
*/
public class AfterEffectB extends AfterEffect {
Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/BasicSimilarityProvider.java Tue Aug 30 17:12:20 2011
@@ -45,4 +45,9 @@ public class BasicSimilarityProvider imp
public Similarity get(String field) {
return sim;
}
+
+ @Override
+ public String toString() {
+ return "BasicSimilarityProvider(" + sim + ")";
+ }
}
\ No newline at end of file
Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java Tue Aug 30 17:12:20 2011
@@ -33,7 +33,7 @@ import org.apache.lucene.search.Explanat
* their counterparts in the Terrier IR engine.</p>
* <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
* is not handled by this implementation.</p>
- *
+ * <p>Note: DFR models currently do not support index-time boosting.</p>
* @see BasicModel
* @see AfterEffect
* @see Normalization
@@ -58,31 +58,17 @@ public class DFRSimilarity extends Simil
this.normalization = normalization;
}
- /** Creates a DFR model with no normalization. */
- public DFRSimilarity(BasicModel basicModel,
- AfterEffect afterEffect) {
- this(basicModel, afterEffect, new Normalization.NoNormalization());
- }
-
- /** Creates a DFR model with no aftereffect. */
- public DFRSimilarity(BasicModel basicModel,
- Normalization normalization) {
- this(basicModel, new AfterEffect.NoAfterEffect(), normalization);
- }
-
- /** Creates a DFR model with only a basic model. */
- public DFRSimilarity(BasicModel basicModel) {
- this(basicModel,
- new AfterEffect.NoAfterEffect(),
- new Normalization.NoNormalization());
- }
-
@Override
protected float score(BasicStats stats, float freq, float docLen) {
float tfn = normalization.tfn(stats, freq, docLen);
return stats.getTotalBoost() *
basicModel.score(stats, tfn) * afterEffect.score(stats, tfn);
}
+
+ @Override
+ protected boolean supportsIndexTimeBoost() {
+ return false;
+ }
@Override
protected void explain(Explanation expl,
Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/IBSimilarity.java Tue Aug 30 17:12:20 2011
@@ -57,11 +57,6 @@ public class IBSimilarity extends Simila
this.normalization = normalization;
}
- /** Creates an instance with no normalization. */
- public IBSimilarity(Distribution distribution, Lambda lambda) {
- this(distribution, lambda, new Normalization.NoNormalization());
- }
-
@Override
protected float score(BasicStats stats, float freq, float docLen) {
return stats.getTotalBoost() *
Modified: lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java Tue Aug 30 17:12:20 2011
@@ -66,6 +66,20 @@ public abstract class SimilarityBase ext
}
/**
+ * True if this implementation supports index-time boosting.
+ * <p>
+ * Note: although subclasses use the same length normalization encoding as
+ * Lucene's DefaultSimilarity, index-time boosting does not always work well: some
+ * implementing models may have more sophisticated normalizations (e.g. Bernoulli
+ * aftereffect) that cannot be 'outsmarted' by making a document's length appear shorter.
+ * In these cases boosting a document higher may actually have the reverse effect,
+ * so subclasses can return false here so that the user will get an error instead.
+ */
+ protected boolean supportsIndexTimeBoost() {
+ return true;
+ }
+
+ /**
* Calls {@link #fillBasicStats(BasicStats, IndexSearcher, String, TermContext...)}.
* Subclasses that override this method may invoke {@code fillStats} with any
* subclass of {@code BasicStats}.
@@ -223,6 +237,9 @@ public abstract class SimilarityBase ext
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength() / state.getBoost();
+ if (!supportsIndexTimeBoost() && state.getBoost() != 1F) {
+ throw new UnsupportedOperationException("index-time boosting is not supported");
+ }
return encodeNormValue(state.getBoost(), numTerms);
}
@@ -267,13 +284,13 @@ public abstract class SimilarityBase ext
public float score(int doc, int freq) {
// We have to supply something in case norms are omitted
return SimilarityBase.this.score(stats, freq,
- norms == null ? freq : decodeNormValue(norms[doc]));
+ norms == null ? 1F : decodeNormValue(norms[doc]));
}
@Override
public Explanation explain(int doc, Explanation freq) {
return SimilarityBase.this.explain(stats, doc, freq,
- norms == null ? freq.getValue() : decodeNormValue(norms[doc]));
+ norms == null ? 1F : decodeNormValue(norms[doc]));
}
}
@@ -296,12 +313,12 @@ public abstract class SimilarityBase ext
public float score(int doc, float freq) {
// We have to supply something in case norms are omitted
return SimilarityBase.this.score(stats, freq,
- norms == null ? freq : decodeNormValue(norms[doc]));
+ norms == null ? 1F : decodeNormValue(norms[doc]));
}
@Override
public Explanation explain(int doc, Explanation freq) {
return SimilarityBase.this.explain(stats, doc, freq,
- norms == null ? freq.getValue() : decodeNormValue(norms[doc]));
+ norms == null ? 1F : decodeNormValue(norms[doc]));
}
@Override
Modified: lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test-framework/org/apache/lucene/search/RandomSimilarityProvider.java Tue Aug 30 17:12:20 2011
@@ -111,7 +111,9 @@ public class RandomSimilarityProvider ex
/** The DFR normalizations to test. */
static Normalization[] NORMALIZATIONS = {
new NormalizationH1(), new NormalizationH2(),
- new Normalization.NoNormalization()
+ // TODO: if we enable NoNormalization, we have to deal with
+ // a couple of tests (e.g. TestDocBoost, TestSort) that expect length normalization
+ // new Normalization.NoNormalization()
};
/** The distributions for IB. */
static Distribution[] DISTRIBUTIONS = {
@@ -126,13 +128,14 @@ public class RandomSimilarityProvider ex
allSims = new ArrayList<Similarity>();
allSims.add(new DefaultSimilarity());
allSims.add(new BM25Similarity());
- /* TODO: enable all sims: for (BasicModel basicModel : BASIC_MODELS) {
+ /* TODO: enable DFR sims
+ for (BasicModel basicModel : BASIC_MODELS) {
for (AfterEffect afterEffect : AFTER_EFFECTS) {
for (Normalization normalization : NORMALIZATIONS) {
allSims.add(new DFRSimilarity(basicModel, afterEffect, normalization));
}
}
- }
+ } */
for (Distribution distribution : DISTRIBUTIONS) {
for (Lambda lambda : LAMBDAS) {
for (Normalization normalization : NORMALIZATIONS) {
@@ -140,9 +143,11 @@ public class RandomSimilarityProvider ex
}
}
}
+ /* TODO: enable LM sims
allSims.add(new LMDirichletSimilarity());
allSims.add(new LMJelinekMercerSimilarity(0.1f));
- allSims.add(new LMJelinekMercerSimilarity(0.7f)); */
+ allSims.add(new LMJelinekMercerSimilarity(0.7f));
+ */
}
@Override
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/TestDocBoost.java Tue Aug 30 17:12:20 2011
@@ -56,7 +56,8 @@ public class TestDocBoost extends Lucene
final float[] scores = new float[4];
- newSearcher(reader).search
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.search
(new TermQuery(new Term("field", "word")),
new Collector() {
private int base = 0;
@@ -82,7 +83,10 @@ public class TestDocBoost extends Lucene
float lastScore = 0.0f;
for (int i = 0; i < 2; i++) {
- assertTrue(scores[i] > lastScore);
+ if (VERBOSE) {
+ System.out.println(searcher.explain(new TermQuery(new Term("field", "word")), i));
+ }
+ assertTrue("score: " + scores[i] + " should be > lastScore: " + lastScore, scores[i] > lastScore);
lastScore = scores[i];
}
Modified: lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java?rev=1163294&r1=1163293&r2=1163294&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java (original)
+++ lucene/dev/branches/flexscoring/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java Tue Aug 30 17:12:20 2011
@@ -409,7 +409,7 @@ public class TestSimilarityBase extends
* no normalization.
*/
public void testLLForIB() throws IOException {
- SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF());
+ SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization());
correctnessTestCore(sim, 4.26267987704f);
}
@@ -419,7 +419,7 @@ public class TestSimilarityBase extends
*/
public void testSPLForIB() throws IOException {
SimilarityBase sim =
- new IBSimilarity(new DistributionSPL(), new LambdaTTF());
+ new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization());
correctnessTestCore(sim, 2.24069910825f);
}
@@ -475,7 +475,7 @@ public class TestSimilarityBase extends
/** Correctness test for the D DFR model (basic model only). */
public void testD() throws IOException {
- SimilarityBase sim = new DFRSimilarity(new BasicModelD());
+ SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099
double phi = FREQ / TOTAL_TERM_FREQ; // 0.1
double D = phi * SimilarityBase.log2(phi / p) + // 0.209745318365
@@ -488,7 +488,7 @@ public class TestSimilarityBase extends
/** Correctness test for the In2 DFR model with no aftereffect. */
public void testIn2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
- new BasicModelIn(), new NormalizationH2());
+ new BasicModelIn(), new AfterEffect.NoAfterEffect(), new NormalizationH2());
float tfn = (float)(FREQ * SimilarityBase.log2( // 8.1894750101
1 + AVG_FIELD_LENGTH / DOC_LEN));
float gold = (float)(tfn * SimilarityBase.log2( // 26.7459577898
@@ -499,7 +499,7 @@ public class TestSimilarityBase extends
/** Correctness test for the IFB DFR model with no normalization. */
public void testIFB() throws IOException {
SimilarityBase sim = new DFRSimilarity(
- new BasicModelIF(), new AfterEffectB());
+ new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization());
float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875
float IF = (float)(FREQ * SimilarityBase.log2( // 8.97759389642
1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5)));