You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/19 13:30:53 UTC

[lucene] branch branch_9x updated: LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new db0e712cad3 LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)
db0e712cad3 is described below

commit db0e712cad3a23e58a66c7c3aa1a9a8b0e217823
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Apr 19 15:26:24 2022 +0200

    LUCENE-10153: Improve accuracy of scaled scores in WANDScorer. (#794)
---
 lucene/CHANGES.txt                                 |  3 ++
 .../lucene/search/MaxScoreSumPropagator.java       |  2 +-
 .../java/org/apache/lucene/search/WANDScorer.java  | 50 +++++++++++-----------
 .../org/apache/lucene/search/TestWANDScorer.java   | 13 +++++-
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 065ff6ac558..32f2cb38007 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -59,6 +59,9 @@ Optimizations
 
 * LUCENE-10481: FacetsCollector will not request scores if it does not use them. (Mike Drob)
 
+* LUCENE-10153: Potential speedup for pure disjunctions whose clauses produce
+  scores that are very close to each other. (Adrien Grand)
+
 Bug Fixes
 ---------------------
 * LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
index 1557cd9ae88..e224d729555 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreSumPropagator.java
@@ -166,7 +166,7 @@ final class MaxScoreSumPropagator {
     return Math.max(minScore, 0f);
   }
 
-  private float scoreSumUpperBound(double sum) {
+  float scoreSumUpperBound(double sum) {
     if (numClauses <= 2) {
       // When there are only two clauses, the sum is always the same regardless
       // of the order.
diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
index 3a655fa153f..a6ba7b0d29f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java
@@ -25,7 +25,6 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
-import java.util.OptionalInt;
 
 /**
  * This implements the WAND (Weak AND) algorithm for dynamic pruning described in "Efficient Query
@@ -53,9 +52,12 @@ import java.util.OptionalInt;
  */
 final class WANDScorer extends Scorer {
 
+  static final int FLOAT_MANTISSA_BITS = 24;
+  private static final long MAX_SCALED_SCORE = (1L << 24) - 1;
+
   /**
    * Return a scaling factor for the given float so that {@code f x 2^scalingFactor} would be in
-   * {@code ]2^15, 2^16]}. Special cases:
+   * {@code [2^23, 2^24[}. Special cases:
    *
    * <pre>
    *    scalingFactor(0) = scalingFactor(MIN_VALUE) - 1
@@ -74,7 +76,7 @@ final class WANDScorer extends Scorer {
       // Since doubles have more amplitude than floats for the
       // exponent, the cast produces a normal value.
       assert d == 0 || Math.getExponent(d) >= Double.MIN_EXPONENT; // normal double
-      return 15 - Math.getExponent(Math.nextDown(d));
+      return FLOAT_MANTISSA_BITS - 1 - Math.getExponent(d);
     }
   }
 
@@ -83,23 +85,21 @@ final class WANDScorer extends Scorer {
    * are used) as well as floating-point arithmetic errors. Those are rounded up in order to make
    * sure we do not miss any matches.
    */
-  private static long scaleMaxScore(float maxScore, int scalingFactor) {
+  static long scaleMaxScore(float maxScore, int scalingFactor) {
+    assert scalingFactor(maxScore) >= scalingFactor;
     assert Float.isNaN(maxScore) == false;
     assert maxScore >= 0;
 
     // NOTE: because doubles have more amplitude than floats for the
     // exponent, the scalb call produces an accurate value.
-    double scaled = Math.scalb((double) maxScore, scalingFactor);
-
-    if (scaled > 1 << 16) {
-      // This happens if either maxScore is +Infty, or we have a scorer that
-      // returned +Infty as its maximum score over the whole range of doc IDs
-      // when computing the scaling factor in the constructor, and now returned
-      // a finite maximum score for a smaller range of doc IDs.
-      return (1L << 32) - 1; // means +Infinity in practice for this scorer
+    final double scaled = Math.scalb((double) maxScore, scalingFactor);
+
+    if (scaled > MAX_SCALED_SCORE) {
+      // This happens if one scorer returns +Infty as a max score
+      return MAX_SCALED_SCORE;
     }
 
-    return (long) Math.ceil(scaled); // round up, cast is accurate since value is <= 2^16
+    return (long) Math.ceil(scaled); // round up, cast is accurate since value is < 2^24
   }
 
   /**
@@ -107,7 +107,7 @@ final class WANDScorer extends Scorer {
    * to make sure that we do not miss any matches.
    */
   private static long scaleMinScore(float minScore, int scalingFactor) {
-    assert Float.isNaN(minScore) == false;
+    assert Float.isFinite(minScore);
     assert minScore >= 0;
 
     // like for scaleMaxScore, this scalb call is accurate
@@ -171,21 +171,23 @@ final class WANDScorer extends Scorer {
     tail = new DisiWrapper[scorers.size()];
 
     if (this.scoreMode == ScoreMode.TOP_SCORES) {
-      OptionalInt scalingFactor = OptionalInt.empty();
+      // To avoid accuracy issues with floating-point numbers, this scorer operates on scaled longs.
+      // How do you choose the scaling factor? The thing is that we want to retain as many
+      // significant bits as possible, but not too many, otherwise operations on longs would be more
+      // precise than the equivalent operations on their unscaled counterparts and we might skip too
+      // many hits. So we compute the maximum possible score produced by this scorer, which is the
+      // sum of the maximum scores of each clause, and compute a scaling factor that would preserve
+      // 24 bits of accuracy - the number of mantissa bits of single-precision floating-point
+      // numbers.
+      double maxScoreSumDouble = 0;
       for (Scorer scorer : scorers) {
         scorer.advanceShallow(0);
         float maxScore = scorer.getMaxScore(DocIdSetIterator.NO_MORE_DOCS);
-        if (maxScore != 0 && Float.isFinite(maxScore)) {
-          // 0 and +Infty should not impact the scale
-          scalingFactor =
-              OptionalInt.of(
-                  Math.min(scalingFactor.orElse(Integer.MAX_VALUE), scalingFactor(maxScore)));
-        }
+        maxScoreSumDouble += maxScore;
       }
-
-      // Use a scaling factor of 0 if all max scores are either 0 or +Infty
-      this.scalingFactor = scalingFactor.orElse(0);
       this.maxScorePropagator = new MaxScoreSumPropagator(scorers);
+      final float maxScoreSum = maxScorePropagator.scoreSumUpperBound(maxScoreSumDouble);
+      this.scalingFactor = scalingFactor(maxScoreSum);
     } else {
       this.scalingFactor = 0;
       this.maxScorePropagator = null;
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
index e09f3131eb5..bf7e3ff7547 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestWANDScorer.java
@@ -54,8 +54,17 @@ public class TestWANDScorer extends LuceneTestCase {
   private void doTestScalingFactor(float f) {
     int scalingFactor = WANDScorer.scalingFactor(f);
     float scaled = Math.scalb(f, scalingFactor);
-    assertTrue("" + scaled, scaled > 1 << 15);
-    assertTrue("" + scaled, scaled <= 1 << 16);
+    assertTrue("" + scaled, scaled >= 1 << (WANDScorer.FLOAT_MANTISSA_BITS - 1));
+    assertTrue("" + scaled, scaled < 1 << WANDScorer.FLOAT_MANTISSA_BITS);
+  }
+
+  public void testScaleMaxScore() {
+    assertEquals(
+        1 << (WANDScorer.FLOAT_MANTISSA_BITS - 1),
+        WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(32f)));
+    assertEquals(1, WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(1L << 60)));
+    assertEquals(
+        1, WANDScorer.scaleMaxScore(32f, WANDScorer.scalingFactor(Float.POSITIVE_INFINITY)));
   }
 
   private Query maybeWrap(Query query) {