You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/01 21:42:48 UTC
[lucene] branch branch_9x updated: LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)

This is an automated email from the ASF dual-hosted git repository.

gsmiller pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 9ab4620e3ff LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
9ab4620e3ff is described below

commit 9ab4620e3ff14381b18b95295577a7ba657bb0c8
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Thu Sep 1 14:04:43 2022 -0700

    LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
---
 lucene/CHANGES.txt                                 |  3 +
 .../org/apache/lucene/search/TermInSetQuery.java   | 66 ++++++++++++++++++++--
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4dfeeb0ac93..04e46a2ee33 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,9 @@ Improvements
 ---------------------
 * LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
 
+* LUCENE-10207: TermInSetQuery can now provide a ScorerSupplier with cost estimation, making it
+  usable in IndexOrDocValuesQuery. (Greg Miller)
+
 * LUCENE-10216: Use MergePolicy to define and MergeScheduler to trigger the reader merges
   required by addIndexes(CodecReader[]) API. (Vigya Sharma, Michael McCandless)
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 962cca6f714..b09fc9f8b3c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -349,16 +349,70 @@ public class TermInSetQuery extends Query implements Accountable {
         }
       }
 
+      @Override
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        Terms indexTerms = context.reader().terms(field);
+        if (indexTerms == null) {
+          return null;
+        }
+
+        // Cost estimation reasoning is:
+        //  1. Assume every query term matches at least one document (queryTermsCount).
+        //  2. Determine the total number of docs beyond the first one for each term.
+        //     That count provides a ceiling on the number of extra docs that could match beyond
+        //     that first one. (We omit the first since it's already been counted in #1).
+        // This approach still provides correct worst-case cost in general, but provides tighter
+        // estimates for primary-key-like fields. See: LUCENE-10207
+
+        // TODO: This cost estimation may grossly overestimate since we have no index statistics
+        // for the specific query terms. While it's nice to avoid the cost of intersecting the
+        // query terms with the index, it could be beneficial to do that work and get better
+        // cost estimates.
+        final long cost;
+        final long queryTermsCount = termData.size();
+        long potentialExtraCost = indexTerms.getSumDocFreq();
+        final long indexedTermCount = indexTerms.size();
+        if (indexedTermCount != -1) {
+          potentialExtraCost -= indexedTermCount;
+        }
+        cost = queryTermsCount + potentialExtraCost;
+
+        final Weight weight = this;
+        return new ScorerSupplier() {
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
+            if (weightOrDocIdSet == null) {
+              return null;
+            }
+
+            final Scorer scorer;
+            if (weightOrDocIdSet.weight != null) {
+              scorer = weightOrDocIdSet.weight.scorer(context);
+            } else {
+              scorer = scorer(weightOrDocIdSet.set);
+            }
+
+            return Objects.requireNonNullElseGet(
+                scorer,
+                () ->
+                    new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
+          }
+
+          @Override
+          public long cost() {
+            return cost;
+          }
+        };
+      }
+
       @Override
       public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
+        final ScorerSupplier supplier = scorerSupplier(context);
+        if (supplier == null) {
           return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
         }
+        return supplier.get(Long.MAX_VALUE);
       }
 
       @Override