You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by za...@apache.org on 2022/07/20 03:10:00 UTC
[lucene] branch branch_9x updated: LUCENE-10480: (Backporting) Use BulkScorer to limit BMMScorer to only top-level disjunctions (#1037)
This is an automated email from the ASF dual-hosted git repository.
zacharymorn pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 8ebb3305648 LUCENE-10480: (Backporting) Use BulkScorer to limit BMMScorer to only top-level disjunctions (#1037)
8ebb3305648 is described below
commit 8ebb3305648aea8f551c2dd144d5a527b8982638
Author: Zach Chen <za...@yahoo.com>
AuthorDate: Tue Jul 19 20:09:55 2022 -0700
LUCENE-10480: (Backporting) Use BulkScorer to limit BMMScorer to only top-level disjunctions (#1037)
---
.../lucene/search/Boolean2ScorerSupplier.java | 15 --
.../org/apache/lucene/search/BooleanWeight.java | 64 +++++++-
.../lucene/search/TestBlockMaxMaxscoreScorer.java | 173 +++++++++++++++------
.../org/apache/lucene/search/TestBooleanQuery.java | 110 +++++++++++++
4 files changed, 294 insertions(+), 68 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java
index 8ace300e2b6..bdf085d4669 100644
--- a/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java
+++ b/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java
@@ -118,21 +118,6 @@ final class Boolean2ScorerSupplier extends ScorerSupplier {
leadCost);
}
- // pure two terms disjunction
- if (scoreMode == ScoreMode.TOP_SCORES
- && minShouldMatch <= 1
- && subs.get(Occur.FILTER).isEmpty()
- && subs.get(Occur.MUST).isEmpty()
- && subs.get(Occur.MUST_NOT).isEmpty()
- && subs.get(Occur.SHOULD).size() == 2) {
-
- final List<Scorer> optionalScorers = new ArrayList<>();
- for (ScorerSupplier scorer : subs.get(Occur.SHOULD)) {
- optionalScorers.add(scorer.get(leadCost));
- }
- return new BlockMaxMaxscoreScorer(weight, optionalScorers);
- }
-
// pure disjunction
if (subs.get(Occur.FILTER).isEmpty() && subs.get(Occur.MUST).isEmpty()) {
return excl(
diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
index 42708d22cbc..d7279df8bbe 100644
--- a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
@@ -34,7 +34,7 @@ final class BooleanWeight extends Weight {
final BooleanQuery query;
- private static class WeightedBooleanClause {
+ protected static class WeightedBooleanClause {
final BooleanClause clause;
final Weight weight;
@@ -191,6 +191,63 @@ final class BooleanWeight extends Weight {
// or null if it is not applicable
// pkg-private for forcing use of BooleanScorer in tests
BulkScorer optionalBulkScorer(LeafReaderContext context) throws IOException {
+ if (scoreMode == ScoreMode.TOP_SCORES) {
+ if (!query.isPureDisjunction() || weightedClauses.size() > 2) {
+ return null;
+ }
+
+ List<ScorerSupplier> optional = new ArrayList<>();
+ for (WeightedBooleanClause wc : weightedClauses) {
+ Weight w = wc.weight;
+ BooleanClause c = wc.clause;
+ if (c.getOccur() != Occur.SHOULD) {
+ continue;
+ }
+ ScorerSupplier scorer = w.scorerSupplier(context);
+ if (scorer != null) {
+ optional.add(scorer);
+ }
+ }
+
+ if (optional.size() <= 1) {
+ return null;
+ }
+
+ List<Scorer> optionalScorers = new ArrayList<>();
+ for (ScorerSupplier ss : optional) {
+ optionalScorers.add(ss.get(Long.MAX_VALUE));
+ }
+
+ return new BulkScorer() {
+ final Scorer bmmScorer = new BlockMaxMaxscoreScorer(BooleanWeight.this, optionalScorers);
+ final DocIdSetIterator iterator = bmmScorer.iterator();
+
+ @Override
+ public int score(LeafCollector collector, Bits acceptDocs, int min, int max)
+ throws IOException {
+ collector.setScorer(bmmScorer);
+
+ int doc = bmmScorer.docID();
+ if (doc < min) {
+ doc = iterator.advance(min);
+ }
+ while (doc < max) {
+ if (acceptDocs == null || acceptDocs.get(doc)) {
+ collector.collect(doc);
+ }
+
+ doc = iterator.nextDoc();
+ }
+ return doc;
+ }
+
+ @Override
+ public long cost() {
+ return iterator.cost();
+ }
+ };
+ }
+
List<BulkScorer> optional = new ArrayList<BulkScorer>();
for (WeightedBooleanClause wc : weightedClauses) {
Weight w = wc.weight;
@@ -329,11 +386,6 @@ final class BooleanWeight extends Weight {
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
- if (scoreMode == ScoreMode.TOP_SCORES) {
- // If only the top docs are requested, use the default bulk scorer
- // so that we can dynamically prune non-competitive hits.
- return super.bulkScorer(context);
- }
final BulkScorer bulkScorer = booleanScorer(context);
if (bulkScorer != null) {
// bulk scoring is applicable, use it
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxMaxscoreScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxMaxscoreScorer.java
index 57d62cdc94a..287f2abaf98 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxMaxscoreScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBlockMaxMaxscoreScorer.java
@@ -18,15 +18,17 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.tests.search.AssertingScorer;
import org.apache.lucene.tests.util.LuceneTestCase;
// These basic tests are similar to some of the tests in TestWANDScorer, and may not need to be kept
@@ -62,26 +64,22 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader);
Query query =
- new BooleanQuery.Builder()
- .add(
- new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
- BooleanClause.Occur.SHOULD)
- .add(
- new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
- BooleanClause.Occur.SHOULD)
- .build();
+ new BlockMaxMaxscoreQuery(
+ new BooleanQuery.Builder()
+ .add(
+ new BoostQuery(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
+ BooleanClause.Occur.SHOULD)
+ .build());
Scorer scorer =
searcher
.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1)
.scorer(searcher.getIndexReader().leaves().get(0));
- if (scorer instanceof AssertingScorer) {
- assertTrue(((AssertingScorer) scorer).getIn() instanceof BlockMaxMaxscoreScorer);
- } else {
- assertTrue(scorer instanceof BlockMaxMaxscoreScorer);
- }
-
assertEquals(0, scorer.iterator().nextDoc());
assertEquals(2 + 1, scorer.score(), 0);
@@ -102,7 +100,7 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
}
}
- public void testBasicsWithThreeDisjunctionClausesNotUseBMMScorer() throws Exception {
+ public void testBasicsWithThreeDisjunctionClauses() throws Exception {
try (Directory dir = newDirectory()) {
writeDocuments(dir);
@@ -110,29 +108,26 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader);
Query query =
- new BooleanQuery.Builder()
- .add(
- new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
- BooleanClause.Occur.SHOULD)
- .add(
- new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
- BooleanClause.Occur.SHOULD)
- .add(
- new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3),
- BooleanClause.Occur.SHOULD)
- .build();
+ new BlockMaxMaxscoreQuery(
+ new BooleanQuery.Builder()
+ .add(
+ new BoostQuery(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new BoostQuery(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3),
+ BooleanClause.Occur.SHOULD)
+ .build());
Scorer scorer =
searcher
.createWeight(searcher.rewrite(query), ScoreMode.TOP_SCORES, 1)
.scorer(searcher.getIndexReader().leaves().get(0));
- if (scorer instanceof AssertingScorer) {
- assertTrue(((AssertingScorer) scorer).getIn() instanceof WANDScorer);
- } else {
- assertTrue(scorer instanceof WANDScorer);
- }
-
assertEquals(0, scorer.iterator().nextDoc());
assertEquals(2 + 1, scorer.score(), 0);
@@ -163,15 +158,16 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
Query query =
new BooleanQuery.Builder()
.add(
- new BooleanQuery.Builder()
- .add(
- new BoostQuery(
- new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
- BooleanClause.Occur.SHOULD)
- .add(
- new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
- BooleanClause.Occur.SHOULD)
- .build(),
+ new BlockMaxMaxscoreQuery(
+ new BooleanQuery.Builder()
+ .add(
+ new BoostQuery(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
+ BooleanClause.Occur.SHOULD)
+ .build()),
BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("foo", "C")), BooleanClause.Occur.FILTER)
.build();
@@ -214,11 +210,17 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
Query query =
new BooleanQuery.Builder()
.add(
- new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
- BooleanClause.Occur.SHOULD)
- .add(
- new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
- BooleanClause.Occur.SHOULD)
+ new BlockMaxMaxscoreQuery(
+ new BooleanQuery.Builder()
+ .add(
+ new BoostQuery(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
+ BooleanClause.Occur.SHOULD)
+ .build()),
+ BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("foo", "C")), BooleanClause.Occur.MUST_NOT)
.build();
@@ -252,4 +254,81 @@ public class TestBlockMaxMaxscoreScorer extends LuceneTestCase {
}
}
}
+
+ private static class BlockMaxMaxscoreQuery extends Query {
+ private final BooleanQuery query;
+
+ private BlockMaxMaxscoreQuery(BooleanQuery query) {
+ assert query.isPureDisjunction()
+ : "This test utility query is only used to create BlockMaxMaxscoreScorer for disjunctions.";
+ assert query.clauses().size() >= 2
+ : "There must be at least two optional clauses to use this test utility query.";
+ this.query = query;
+ }
+
+ @Override
+ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+ throws IOException {
+ return new Weight(query) {
+
+ @Override
+ public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+ // no-ops
+ return null;
+ }
+
+ @Override
+ public Scorer scorer(LeafReaderContext context) throws IOException {
+ BooleanWeight weight = (BooleanWeight) query.createWeight(searcher, scoreMode, boost);
+ List<Scorer> optionalScorers =
+ weight.weightedClauses.stream()
+ .map(wc -> wc.weight)
+ .map(
+ w -> {
+ try {
+ return w.scorerSupplier(context);
+ } catch (IOException e) {
+ throw new AssertionError(e);
+ }
+ })
+ .map(
+ ss -> {
+ try {
+ return ss.get(Long.MAX_VALUE);
+ } catch (IOException e) {
+ throw new AssertionError(e);
+ }
+ })
+ .collect(Collectors.toList());
+
+ return new BlockMaxMaxscoreScorer(weight, optionalScorers);
+ }
+
+ @Override
+ public boolean isCacheable(LeafReaderContext ctx) {
+ return false;
+ }
+ };
+ }
+
+ @Override
+ public String toString(String field) {
+ return "BlockMaxMaxscoreQuery";
+ }
+
+ @Override
+ public void visit(QueryVisitor visitor) {
+ // no-ops
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return sameClassAs(other) && query.equals(((BlockMaxMaxscoreQuery) other).query);
+ }
+
+ @Override
+ public int hashCode() {
+ return 31 * classHash() + query.hashCode();
+ }
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java
index a5710121acf..d3adccb7e22 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java
@@ -16,6 +16,9 @@
*/
package org.apache.lucene.search;
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
+
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import java.io.IOException;
import java.util.ArrayList;
@@ -905,6 +908,113 @@ public class TestBooleanQuery extends LuceneTestCase {
dir.close();
}
+ // test BlockMaxMaxscoreScorer
+ public void testDisjunctionTwoClausesMatchesCountAndScore() throws Exception {
+ List<String[]> docContent =
+ Arrays.asList(
+ new String[] {"A", "B"}, // 0
+ new String[] {"A"}, // 1
+ new String[] {}, // 2
+ new String[] {"A", "B", "C"}, // 3
+ new String[] {"B"}, // 4
+ new String[] {"B", "C"} // 5
+ );
+
+ // result sorted by score
+ int[][] matchDocScore = {
+ {0, 2 + 1},
+ {3, 2 + 1},
+ {1, 2},
+ {4, 1},
+ {5, 1}
+ };
+
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w =
+ new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
+
+ for (String[] values : docContent) {
+ Document doc = new Document();
+ for (String value : values) {
+ doc.add(new StringField("foo", value, Field.Store.NO));
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ }
+
+ try (IndexReader reader = DirectoryReader.open(dir)) {
+ IndexSearcher searcher = newSearcher(reader);
+
+ Query query =
+ new BooleanQuery.Builder()
+ .add(
+ new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
+ BooleanClause.Occur.SHOULD)
+ .add(
+ new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))),
+ BooleanClause.Occur.SHOULD)
+ .build();
+
+ TopDocs topDocs = searcher.search(query, 10);
+
+ for (int i = 0; i < topDocs.scoreDocs.length; i++) {
+ ScoreDoc scoreDoc = topDocs.scoreDocs[i];
+ assertEquals(matchDocScore[i][0], scoreDoc.doc);
+ assertEquals(matchDocScore[i][1], scoreDoc.score, 0);
+ }
+ }
+ }
+ }
+
+ public void testDisjunctionRandomClausesMatchesCount() throws Exception {
+ int numFieldValue = RandomNumbers.randomIntBetween(random(), 1, 10);
+ int[] numDocsPerFieldValue = new int[numFieldValue];
+ int allDocsCount = 0;
+
+ for (int i = 0; i < numDocsPerFieldValue.length; i++) {
+ int numDocs = RandomNumbers.randomIntBetween(random(), 10, 50);
+ numDocsPerFieldValue[i] = numDocs;
+ allDocsCount += numDocs;
+ }
+
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w =
+ new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
+
+ for (int i = 0; i < numFieldValue; i++) {
+ for (int j = 0; j < numDocsPerFieldValue[i]; j++) {
+ Document doc = new Document();
+ doc.add(new StringField("field", String.valueOf(i), Field.Store.NO));
+ w.addDocument(doc);
+ }
+ }
+
+ w.forceMerge(1);
+ }
+
+ int matchedDocsCount = 0;
+ try (IndexReader reader = DirectoryReader.open(dir)) {
+ IndexSearcher searcher = newSearcher(reader);
+
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+
+ for (int i = 0; i < numFieldValue; i++) {
+ if (randomBoolean()) {
+ matchedDocsCount += numDocsPerFieldValue[i];
+ builder.add(
+ new TermQuery(new Term("field", String.valueOf(i))), BooleanClause.Occur.SHOULD);
+ }
+ }
+
+ Query query = builder.build();
+
+ TopDocs topDocs = searcher.search(query, allDocsCount);
+ assertEquals(matchedDocsCount, topDocs.scoreDocs.length);
+ }
+ }
+ }
+
public void testProhibitedMatchesCount() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());