You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/01 21:04:52 UTC
[lucene] branch main updated: LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 680f21dca5f LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
680f21dca5f is described below
commit 680f21dca5fe9d1bee85f2a0ea688119fda49e80
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Thu Sep 1 14:04:43 2022 -0700
LUCENE-10207: TermInSetQuery now provides a ScoreSupplier with cost estimation for use in IndexOrDocValuesQuery (#1058)
---
lucene/CHANGES.txt | 3 +
.../org/apache/lucene/search/TermInSetQuery.java | 66 ++++++++++++++++++++--
2 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 02956ce0b37..bb3c16dd612 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -101,6 +101,9 @@ Improvements
---------------------
* LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
+* LUCENE-10207: TermInSetQuery can now provide a ScorerSupplier with cost estimation, making it
+ usable in IndexOrDocValuesQuery. (Greg Miller)
+
* GITHUB#11715: Add Integer awareness to RamUsageEstimator.sizeOf (Mike Drob)
Optimizations
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 6fd2e5f4d12..cd44ab5e1b2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -353,16 +353,70 @@ public class TermInSetQuery extends Query implements Accountable {
}
}
+ @Override
+ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+ Terms indexTerms = context.reader().terms(field);
+ if (indexTerms == null) {
+ return null;
+ }
+
+ // Cost estimation reasoning is:
+ // 1. Assume every query term matches at least one document (queryTermsCount).
+ // 2. Determine the total number of docs beyond the first one for each term.
+ // That count provides a ceiling on the number of extra docs that could match beyond
+ // that first one. (We omit the first since it's already been counted in #1).
+ // This approach still provides correct worst-case cost in general, but provides tighter
+ // estimates for primary-key-like fields. See: LUCENE-10207
+
+ // TODO: This cost estimation may grossly overestimate since we have no index statistics
+ // for the specific query terms. While it's nice to avoid the cost of intersecting the
+ // query terms with the index, it could be beneficial to do that work and get better
+ // cost estimates.
+ final long cost;
+ final long queryTermsCount = termData.size();
+ long potentialExtraCost = indexTerms.getSumDocFreq();
+ final long indexedTermCount = indexTerms.size();
+ if (indexedTermCount != -1) {
+ potentialExtraCost -= indexedTermCount;
+ }
+ cost = queryTermsCount + potentialExtraCost;
+
+ final Weight weight = this;
+ return new ScorerSupplier() {
+ @Override
+ public Scorer get(long leadCost) throws IOException {
+ WeightOrDocIdSet weightOrDocIdSet = rewrite(context);
+ if (weightOrDocIdSet == null) {
+ return null;
+ }
+
+ final Scorer scorer;
+ if (weightOrDocIdSet.weight != null) {
+ scorer = weightOrDocIdSet.weight.scorer(context);
+ } else {
+ scorer = scorer(weightOrDocIdSet.set);
+ }
+
+ return Objects.requireNonNullElseGet(
+ scorer,
+ () ->
+ new ConstantScoreScorer(weight, score(), scoreMode, DocIdSetIterator.empty()));
+ }
+
+ @Override
+ public long cost() {
+ return cost;
+ }
+ };
+ }
+
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
- final WeightOrDocIdSet weightOrBitSet = rewrite(context);
- if (weightOrBitSet == null) {
+ final ScorerSupplier supplier = scorerSupplier(context);
+ if (supplier == null) {
return null;
- } else if (weightOrBitSet.weight != null) {
- return weightOrBitSet.weight.scorer(context);
- } else {
- return scorer(weightOrBitSet.set);
}
+ return supplier.get(Long.MAX_VALUE);
}
@Override