You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/12/29 09:06:54 UTC
[3/4] lucene-solr:master: LUCENE-8010: Fix similarities so that they pass tests.
LUCENE-8010: Fix similarities so that they pass tests.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/b2f24816
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/b2f24816
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/b2f24816
Branch: refs/heads/master
Commit: b2f248164c1a3ddf213a56778d55c9252a022f18
Parents: 8e439a0
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Dec 28 17:44:01 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Fri Dec 29 10:06:00 2017 +0100
----------------------------------------------------------------------
.../lucene/search/similarities/Axiomatic.java | 28 +++++++++++++++++++-
.../search/similarities/AxiomaticF1EXP.java | 2 +-
.../search/similarities/AxiomaticF1LOG.java | 2 +-
.../search/similarities/AxiomaticF3EXP.java | 2 +-
.../search/similarities/AxiomaticF3LOG.java | 2 +-
.../search/similarities/DistributionSPL.java | 26 +++++++++++++++---
.../lucene/search/similarities/LambdaDF.java | 7 ++++-
.../lucene/search/similarities/LambdaTTF.java | 7 ++++-
.../search/similarities/TestAxiomaticF1EXP.java | 4 ---
.../search/similarities/TestAxiomaticF1LOG.java | 4 ---
.../search/similarities/TestAxiomaticF3EXP.java | 7 +----
.../search/similarities/TestAxiomaticF3LOG.java | 7 +----
.../similarities/TestDistributionSPL.java | 4 ---
.../search/similarities/TestSimilarity2.java | 7 +++++
.../search/similarities/RandomSimilarity.java | 11 +++++---
15 files changed, 81 insertions(+), 39 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
index 403773e..553fd42 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
@@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
+import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.Explanation;
@@ -101,11 +102,13 @@ public abstract class Axiomatic extends SimilarityBase {
@Override
public double score(BasicStats stats, double freq, double docLen) {
- return tf(stats, freq, docLen)
+ double score = tf(stats, freq, docLen)
* ln(stats, freq, docLen)
* tfln(stats, freq, docLen)
* idf(stats, freq, docLen)
- gamma(stats, freq, docLen);
+ // AxiomaticF3 similarities might produce negative scores due to their gamma component
+ return Math.max(0, score);
}
@Override
@@ -115,6 +118,29 @@ public abstract class Axiomatic extends SimilarityBase {
}
@Override
+ protected Explanation explain(
+ BasicStats stats, int doc, Explanation freq, double docLen) {
+ List<Explanation> subs = new ArrayList<>();
+ explain(subs, stats, doc, freq.getValue(), docLen);
+
+ double score = tf(stats, freq.getValue(), docLen)
+ * ln(stats, freq.getValue(), docLen)
+ * tfln(stats, freq.getValue(), docLen)
+ * idf(stats, freq.getValue(), docLen)
+ - gamma(stats, freq.getValue(), docLen);
+
+ Explanation explanation = Explanation.match((float) score,
+ "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
+ subs);
+ if (score < 0) {
+ explanation = Explanation.match(0, "max of:",
+ Explanation.match(0, "Minimum legal score"),
+ explanation);
+ }
+ return explanation;
+ }
+
+ @Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java
index c026feb..ca5c42b 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1EXP.java
@@ -60,7 +60,7 @@ public class AxiomaticF1EXP extends Axiomatic {
*/
@Override
protected double tf(BasicStats stats, double freq, double docLen) {
- if (freq <= 0.0) return 0.0;
+ freq += 1; // otherwise gives negative scores for freqs < 1
return 1 + Math.log(1 + Math.log(freq));
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java
index 2e19255..6ef3587 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF1LOG.java
@@ -52,7 +52,7 @@ public class AxiomaticF1LOG extends Axiomatic {
*/
@Override
protected double tf(BasicStats stats, double freq, double docLen) {
- if (freq <= 0.0) return 0.0;
+ freq += 1; // otherwise gives negative scores for freqs < 1
return 1 + Math.log(1 + Math.log(freq));
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java
index 635dc68..a54c754 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3EXP.java
@@ -58,7 +58,7 @@ public class AxiomaticF3EXP extends Axiomatic {
*/
@Override
protected double tf(BasicStats stats, double freq, double docLen) {
- if (freq <= 0.0) return 0.0;
+ freq += 1; // otherwise gives negative scores for freqs < 1
return 1 + Math.log(1 + Math.log(freq));
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java
index 4753e4e..194b70a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/AxiomaticF3LOG.java
@@ -47,7 +47,7 @@ public class AxiomaticF3LOG extends Axiomatic {
*/
@Override
protected double tf(BasicStats stats, double freq, double docLen) {
- if (freq <= 0.0) return 0.0;
+ freq += 1; // otherwise gives negative scores for freqs < 1
return 1 + Math.log(1 + Math.log(freq));
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
index fc05d72..2ab44df 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
@@ -34,11 +34,29 @@ public class DistributionSPL extends Distribution {
@Override
public final double score(BasicStats stats, double tfn, double lambda) {
- if (lambda == 1d) {
- lambda = 0.99d;
+ assert lambda != 1;
+
+ // tfn/(tfn+1) -> 1 - 1/(tfn+1), guaranteed to be non decreasing when tfn increases
+ double q = 1 - 1 / (tfn + 1);
+ if (q == 1) {
+ q = Math.nextDown(1.0);
+ }
+
+ double pow = Math.pow(lambda, q);
+ if (pow == lambda) {
+ // this can happen because of floating-point rounding
+ // but then we return infinity when taking the log, so we enforce
+ // that pow is different from lambda
+ if (lambda < 1) {
+ // x^y > x when x < 1 and y < 1
+ pow = Math.nextUp(lambda);
+ } else {
+ // x^y < x when x > 1 and y < 1
+ pow = Math.nextDown(lambda);
+ }
}
- return -Math.log(
- (Math.pow(lambda, (tfn / (tfn + 1))) - lambda) / (1 - lambda));
+
+ return -Math.log((pow - lambda) / (1 - lambda));
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaDF.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaDF.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaDF.java
index 7dc320d..6b7dbb2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaDF.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaDF.java
@@ -30,7 +30,12 @@ public class LambdaDF extends Lambda {
@Override
public final float lambda(BasicStats stats) {
- return (stats.getDocFreq()+1F) / (stats.getNumberOfDocuments()+1F);
+ float lambda = (float) ((stats.getDocFreq() + 1.0) / (stats.getNumberOfDocuments() + 1.0));
+ if (lambda == 1) {
+ // Distribution SPL cannot work with values of lambda that are equal to 1
+ lambda = Math.nextDown(lambda);
+ }
+ return lambda;
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaTTF.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
index 6dc54a3..72eae4c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
@@ -30,7 +30,12 @@ public class LambdaTTF extends Lambda {
@Override
public final float lambda(BasicStats stats) {
- return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfDocuments()+1F);
+ float lambda = (float) ((stats.getTotalTermFreq() + 1.0) / (stats.getNumberOfDocuments() + 1.0));
+ if (lambda == 1) {
+ // Distribution SPL cannot work with values of lambda that are equal to 1
+ lambda = Math.nextUp(lambda);
+ }
+ return lambda;
}
public final Explanation explain(BasicStats stats) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java
index 16da903..9e2edf1 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1EXP.java
@@ -16,10 +16,6 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns NaN scores for sloppy freqs < 1 (due to log without floor)
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF1EXP extends AxiomaticTestCase {
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java
index 88ad18e..0d6ba48 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF1LOG.java
@@ -16,10 +16,6 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns NaN scores for sloppy freqs < 1 (due to log without floor)
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF1LOG extends AxiomaticTestCase {
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java
index 69ab719..63ad87a 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3EXP.java
@@ -16,16 +16,11 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns negative scores at least, but it (now) warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF3EXP extends AxiomaticTestCase {
@Override
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
- // TODO: use the randomized parameters and not these hardcoded ones
- return new AxiomaticF3EXP(0.25f, 1);
+ return new AxiomaticF3EXP(s, queryLen);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java
index 6863277..cb92984 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestAxiomaticF3LOG.java
@@ -16,16 +16,11 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// returns negative scores at least, but it (now) warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestAxiomaticF3LOG extends AxiomaticTestCase {
@Override
protected final Similarity getAxiomaticModel(float s, int queryLen, float k) {
- // TODO: use the randomized parameters and not these hardcoded ones
- return new AxiomaticF3LOG(0.25f, 1);
+ return new AxiomaticF3LOG(s, queryLen);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java
index 984915a..20a2b32 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestDistributionSPL.java
@@ -16,10 +16,6 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
-
-// scores go infinite, but it warns it has problems
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-8010")
public class TestDistributionSPL extends DistributionTestCase {
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
index 6fd38bd..49dc154 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@@ -54,6 +54,13 @@ public class TestSimilarity2 extends LuceneTestCase {
sims = new ArrayList<>();
sims.add(new ClassicSimilarity());
sims.add(new BM25Similarity());
+ sims.add(new BooleanSimilarity());
+ sims.add(new AxiomaticF1EXP());
+ sims.add(new AxiomaticF1LOG());
+ sims.add(new AxiomaticF2EXP());
+ sims.add(new AxiomaticF2LOG());
+ sims.add(new AxiomaticF3EXP(0.25f, 3));
+ sims.add(new AxiomaticF3LOG(0.25f, 3));
// TODO: not great that we dup this all with TestSimilarityBase
for (BasicModel basicModel : TestSimilarityBase.BASIC_MODELS) {
for (AfterEffect afterEffect : TestSimilarityBase.AFTER_EFFECTS) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b2f24816/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
index 444e8ef..0925aee 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@@ -89,8 +89,12 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
allSims = new ArrayList<>();
allSims.add(new ClassicSimilarity());
allSims.add(new BM25Similarity());
- // We cannot do this, because this similarity behaves in "non-traditional" ways:
- // allSims.add(new BooleanSimilarity());
+ allSims.add(new AxiomaticF1EXP());
+ allSims.add(new AxiomaticF1LOG());
+ allSims.add(new AxiomaticF2EXP());
+ allSims.add(new AxiomaticF2LOG());
+
+ allSims.add(new BooleanSimilarity());
for (BasicModel basicModel : BASIC_MODELS) {
for (AfterEffect afterEffect : AFTER_EFFECTS) {
for (Normalization normalization : NORMALIZATIONS) {
@@ -105,8 +109,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
}
}
}
- /* TODO: enable Dirichlet
- allSims.add(new LMDirichletSimilarity()); */
+ allSims.add(new LMDirichletSimilarity());
allSims.add(new LMJelinekMercerSimilarity(0.1f));
allSims.add(new LMJelinekMercerSimilarity(0.7f));
for (Independence independence : INDEPENDENCE_MEASURES) {