You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/19 13:30:53 UTC
[lucene] branch branch_9x updated: LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new db0e712cad3 LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)
db0e712cad3 is described below
commit db0e712cad3a23e58a66c7c3aa1a9a8b0e217823
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Apr 19 15:26:24 2022 +0200
LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)
---
lucene/CHANGES.txt | 3 ++
.../lucene/search/MaxScoreSumPropagator.java | 2 +-
.../java/org/apache/lucene/search/WANDScorer.java | 50 +++++++++++-----------
.../org/apache/lucene/search/TestWANDScorer.java | 13 +++++-
4 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 065ff6ac558..32f2cb38007 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -59,6 +59,9 @@ Optimizations
* LUCENE-10481: FacetsCollector will not request scores if it does not use them. (Mike Drob)
+* LUCENE-10153: Potential speedup for pure disjunctions whose clauses produce
+ scores that are very close to each other. (Adrien Grand)
+
Bug Fixes
---------------------
* LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
index 1557cd9ae88..e224d729555 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
@@ -166,7 +166,7 @@ final class MaxScoreSumPropagator {
return Math.max(minScore, 0f);
}
- private float scoreSumUpperBound(double sum) {
+ float scoreSumUpperBound(double sum) {
if (numClauses <= 2) {
// When there are only two clauses, the sum is always the same regardless
// of the order.
diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
index 3a655fa153f..a6ba7b0d29f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
@@ -25,7 +25,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
-import java.util.OptionalInt;
/**
* This implements the WAND (Weak AND) algorithm for dynamic pruning described in "Efficient Query
@@ -53,9 +52,12 @@ import java.util.OptionalInt;
*/
final class WANDScorer extends Scorer {
+ static final int FLOAT_MANTISSA_BITS = 24;
+ private static final long MAX_SCALED_SCORE = (1L << 24) - 1;
+
/**
* Return a scaling factor for the given float so that {@code f x 2^scalingFactor} would be in
- * {@code ]2^15, 2^16]}. Special cases:
+ * {@code [2^23, 2^24[}. Special cases:
*
* <pre>
* scalingFactor(0) = scalingFactor(MIN_VALUE) - 1
@@ -74,7 +76,7 @@ final class WANDScorer extends Scorer {
// Since doubles have more amplitude than floats for the
// exponent, the cast produces a normal value.
assert d == 0 || Math.getExponent(d) >= Double.MIN_EXPONENT; // normal double
- return 15 - Math.getExponent(Math.nextDown(d));
+ return FLOAT_MANTISSA_BITS - 1 - Math.getExponent(d);
}
}
@@ -83,23 +85,21 @@ final class WANDScorer extends Scorer {
* are used) as well as floating-point arithmetic errors. Those are rounded up in order to make
* sure we do not miss any matches.
*/
- private static long scaleMaxScore(float maxScore, int scalingFactor) {
+ static long scaleMaxScore(float maxScore, int scalingFactor) {
+ assert scalingFactor(maxScore) >= scalingFactor;
assert Float.isNaN(maxScore) == false;
assert maxScore >= 0;
// NOTE: because doubles have more amplitude than floats for the
// exponent, the scalb call produces an accurate value.
- double scaled = Math.scalb((double) maxScore, scalingFactor);
-
- if (scaled > 1 << 16) {
- // This happens if either maxScore is +Infty, or we have a scorer that
- // returned +Infty as its maximum score over the whole range of doc IDs
- // when computing the scaling factor in the constructor, and now returned
- // a finite maximum score for a smaller range of doc IDs.
- return (1L << 32) - 1; // means +Infinity in practice for this scorer
+ final double scaled = Math.scalb((double) maxScore, scalingFactor);
+
+ if (scaled > MAX_SCALED_SCORE) {
+ // This happens if one scorer returns +Infty as a max score
+ return MAX_SCALED_SCORE;
}
- return (long) Math.ceil(scaled); // round up, cast is accurate since value is <= 2^16
+ return (long) Math.ceil(scaled); // round up, cast is accurate since value is < 2^24
}
/**
@@ -107,7 +107,7 @@ final class WANDScorer extends Scorer {
* to make sure that we do not miss any matches.
*/
private static long scaleMinScore(float minScore, int scalingFactor) {
- assert Float.isNaN(minScore) == false;
+ assert Float.isFinite(minScore);
assert minScore >= 0;
// like for scaleMaxScore, this scalb call is accurate
@@ -171,21 +171,23 @@ final class WANDScorer extends Scorer {
tail = new DisiWrapper[scorers.size()];
if (this.scoreMode == ScoreMode.TOP_SCORES) {
- OptionalInt scalingFactor = OptionalInt.empty();
+ // To avoid accuracy issues with floating-point numbers, this scorer operates on scaled longs.
+ // How do you choose the scaling factor? The thing is that we want to retain as many
+ // significant bits as possible, but not too many, otherwise operations on longs would be more
+ // precise than the equivalent operations on their unscaled counterparts and we might skip too
+ // many hits. So we compute the maximum possible score produced by this scorer, which is the
+ // sum of the maximum scores of each clause, and compute a scaling factor that would preserve
+ // 24 bits of accuracy - the number of mantissa bits of single-precision floating-point
+ // numbers.
+ double maxScoreSumDouble = 0;
for (Scorer scorer : scorers) {
scorer.advanceShallow(0);
float maxScore = scorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
- if (maxScore != 0 && Float.isFinite(maxScore)) {
- // 0 and +Infty should not impact the scale
- scalingFactor =
- OptionalInt.of(
- Math.min(scalingFactor.orElse(Integer.MAX_VALUE), scalingFactor(maxScore)));
- }
+ maxScoreSumDouble += maxScore;
}
-
- // Use a scaling factor of 0 if all max scores are either 0 or +Infty
- this.scalingFactor = scalingFactor.orElse(0);
this.maxScorePropagator = new MaxScoreSumPropagator(scorers);
+ final float maxScoreSum = maxScorePropagator.scoreSumUpperBound(maxScoreSumDouble);
+ this.scalingFactor = scalingFactor(maxScoreSum);
} else {
this.scalingFactor = 0;
this.maxScorePropagator = null;
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
index e09f3131eb5..bf7e3ff7547 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
@@ -54,8 +54,17 @@ public class TestWANDScorer extends LuceneTestCase {
private void doTestScalingFactor(float f) {
int scalingFactor = WANDScorer.scalingFactor(f);
float scaled = Math.scalb(f, scalingFactor);
- assertTrue("" + scaled, scaled > 1 << 15);
- assertTrue("" + scaled, scaled <= 1 << 16);
+ assertTrue("" + scaled, scaled >= 1 << (WANDScorer.FLOAT_MANTISSA_BITS - 1));
+ assertTrue("" + scaled, scaled < 1 << WANDScorer.FLOAT_MANTISSA_BITS);
+ }
+
+ public void testScaleMaxScore() {
+ assertEquals(
+ 1 << (WANDScorer.FLOAT_MANTISSA_BITS - 1),
+ WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(32f)));
+ assertEquals(1, WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(1L << 60)));
+ assertEquals(
+ 1, WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(Float.POSITIVE_INFINITY)));
}
private Query maybeWrap(Query query) {