You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/26 17:39:54 UTC
[lucene] branch main updated: Optimize MultiTermQueryConstantScoreWrapper for case when a term matches all docs in a segment. (#11738)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 734841d6c0f Optimize MultiTermQueryConstantScoreWrapper for case when a term matches all docs in a segment. (#11738)
734841d6c0f is described below
commit 734841d6c0f82aa170f882a7d1cb891e87d145df
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Mon Sep 26 10:39:47 2022 -0700
Optimize MultiTermQueryConstantScoreWrapper for case when a term matches all docs in a segment. (#11738)
---
lucene/CHANGES.txt | 5 +++
.../search/MultiTermQueryConstantScoreWrapper.java | 40 ++++++++++++++++------
2 files changed, 35 insertions(+), 10 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ea22d302723..da470df6e54 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -115,6 +115,11 @@ Bug Fixes
trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand)
+Optimizations
+---------------------
+* GITHUB#11738: Optimize MultiTermQueryConstantScoreWrapper when a term is present that matches all
+ docs that have a value for the field in a segment. (Greg Miller)
+
* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
(Luke Kot-Zaniewski)
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
index f729ed6421c..1271b3c5571 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java
@@ -125,11 +125,11 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return new ConstantScoreWeight(this, boost) {
/**
- * Try to collect terms from the given terms enum and return true iff all terms could be
- * collected. If {@code false} is returned, the enum is left positioned on the next term.
+ * Try to collect terms from the given terms enum and return true if all terms could be
+ * collected or if one of the iterated terms contains all docs for the field. If {@code false}
+ * is returned, the enum is left positioned on the next term.
*/
- private boolean collectTerms(
- LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms)
+ private boolean collectTerms(int fieldDocCount, TermsEnum termsEnum, List<TermAndState> terms)
throws IOException {
final int threshold =
Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, IndexSearcher.getMaxClauseCount());
@@ -139,12 +139,18 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return true;
}
TermState state = termsEnum.termState();
- terms.add(
+ int docFreq = termsEnum.docFreq();
+ TermAndState termAndState =
new TermAndState(
- BytesRef.deepCopyOf(term),
- state,
- termsEnum.docFreq(),
- termsEnum.totalTermFreq()));
+ BytesRef.deepCopyOf(term), state, docFreq, termsEnum.totalTermFreq());
+ if (fieldDocCount == docFreq) {
+ // If the term contains every document with a value for the field, we can ignore all
+ // other terms:
+ terms.clear();
+ terms.add(termAndState);
+ return true;
+ }
+ terms.add(termAndState);
}
return termsEnum.next() == null;
}
@@ -160,13 +166,14 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return new WeightOrDocIdSet((DocIdSet) null);
}
+ final int fieldDocCount = terms.getDocCount();
final TermsEnum termsEnum = query.getTermsEnum(terms);
assert termsEnum != null;
PostingsEnum docs = null;
final List<TermAndState> collectedTerms = new ArrayList<>();
- if (collectTerms(context, termsEnum, collectedTerms)) {
+ if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
// build a boolean query
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndState t : collectedTerms) {
@@ -193,6 +200,19 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
// Then keep filling the bit set with remaining terms
do {
docs = termsEnum.postings(docs, PostingsEnum.NONE);
+ // If a term contains all docs with a value for the specified field, we can discard the
+ // other terms and just use the dense term's postings:
+ int docFreq = termsEnum.docFreq();
+ if (fieldDocCount == docFreq) {
+ TermStates termStates = new TermStates(searcher.getTopReaderContext());
+ termStates.register(
+ termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
+ Query q =
+ new ConstantScoreQuery(
+ new TermQuery(new Term(query.field, termsEnum.term()), termStates));
+ Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
+ return new WeightOrDocIdSet(weight);
+ }
builder.add(docs);
} while (termsEnum.next() != null);