You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/29 14:14:46 UTC
[lucene] branch branch_9x updated: TermInSetQuery optimization when all docs in a field match a term (#11828)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new cdf6a1a2e69 TermInSetQuery optimization when all docs in a field match a term (#11828)
cdf6a1a2e69 is described below
commit cdf6a1a2e690a49ef895d5b9c8c80eaa07118615
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Thu Sep 29 06:59:59 2022 -0700
TermInSetQuery optimization when all docs in a field match a term (#11828)
---
lucene/CHANGES.txt | 4 ++++
.../java/org/apache/lucene/search/TermInSetQuery.java | 16 ++++++++++++++--
.../org/apache/lucene/search/TestTermInSetQuery.java | 9 ++++++++-
3 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 94066cb52a0..e2caacb7337 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -46,6 +46,10 @@ Optimizations
* GITHUB#11803: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where
possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
+* GITHUB#11828: Tweak TermInSetQuery "dense" optimization to only require that a term match all
+ docs with a value for a given field (rather than all docs in a segment). This is consistent with
+ MultiTermQueryConstantScoreWrapper. (Greg Miller)
+
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 20982d6be63..31577a7c406 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -262,6 +262,7 @@ public class TermInSetQuery extends Query implements Accountable {
if (terms == null) {
return null;
}
+ final int fieldDocCount = terms.getDocCount();
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
TermIterator iterator = termData.iterator();
@@ -277,8 +278,18 @@ public class TermInSetQuery extends Query implements Accountable {
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
assert field.equals(iterator.field());
if (termsEnum.seekExact(term)) {
- if (reader.maxDoc() == termsEnum.docFreq()) {
- return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
+ // If a term contains all docs with a value for the specified field (likely rare),
+ // we can discard the other terms and just use the dense term's postings:
+ int docFreq = termsEnum.docFreq();
+ if (fieldDocCount == docFreq) {
+ TermStates termStates = new TermStates(searcher.getTopReaderContext());
+ termStates.register(
+ termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
+ Query q =
+ new ConstantScoreQuery(
+ new TermQuery(new Term(field, termsEnum.term()), termStates));
+ Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
+ return new WeightOrDocIdSet(weight);
}
if (matchingTerms == null) {
@@ -300,6 +311,7 @@ public class TermInSetQuery extends Query implements Accountable {
}
}
}
+
if (matchingTerms != null) {
assert builder == null;
BooleanQuery.Builder bq = new BooleanQuery.Builder();
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
index 731e6694da2..670e699b43f 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
@@ -51,7 +51,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
public class TestTermInSetQuery extends LuceneTestCase {
- public void testAllDocsTerm() throws IOException {
+ public void testAllDocsInFieldTerm() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
String field = "f";
@@ -69,6 +69,7 @@ public class TestTermInSetQuery extends LuceneTestCase {
otherTerms[idx++] = term;
}
+ // Every doc with a value for `field` will contain `denseTerm`:
int numDocs = 10 * otherTerms.length;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
@@ -78,6 +79,12 @@ public class TestTermInSetQuery extends LuceneTestCase {
iw.addDocument(doc);
}
+ // Make sure there are some docs in the index that don't contain a value for the field at all:
+ for (int i = 0; i < 100; i++) {
+ Document doc = new Document();
+ doc.add(new StringField("foo", "bar", Store.NO));
+ }
+
IndexReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();