You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2022/06/22 03:51:32 UTC
[GitHub] [lucene] zacharymorn opened a new pull request, #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
zacharymorn opened a new pull request, #972:
URL: https://github.com/apache/lucene/pull/972
### Description (or a Jira issue link if you have one)
Use Block-Max-Maxscore algorithm for 2 clauses disjunction. Adapted from PR https://github.com/apache/lucene/pull/101
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r909131493
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
+ nonEssentialMaxScoreSum += w.maxScoreFloat;
+ } else {
+ maxScoreSortedEssentialScorers.add(w);
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ @Override
+ public long cost() {
+ // fixed at initialization
+ return cost;
+ }
+ };
+
+ return new TwoPhaseIterator(approximation) {
+ @Override
+ public boolean matches() throws IOException {
+ return score() >= minCompetitiveScore;
+ }
+
+ @Override
+ public float matchCost() {
+ // maximum number of scorer that matches() might advance
+ return allScorers.length - essentialsScorers.size();
Review Comment:
I see. I've updated it to use the length of all scorers to over-estimate the cost, since either essential or non-essential list may change in length.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
Review Comment:
Ah I see. I have replaced it with a double.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
Review Comment:
Done. `Boolean2ScorerSupplier` already checks for `scoreMode == ScoreMode.TOP_SCORES` before using this scorer.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1163599006
My best guess would be that you are seeing different results mostly because luceneutil picks random queries, and the run that only had disjunctions picked queries that happened to like your change better than the run that included all tasks? These are very impressive speedups indeed!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1169526849
Here are the latest benchmark results after the update:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
Prefix3 196.69 (6.3%) 189.02 (8.6%) -3.9% ( -17% - 11%) 0.102
HighTermTitleBDVSort 33.24 (11.6%) 32.31 (10.3%) -2.8% ( -22% - 21%) 0.421
HighTermDayOfYearSort 137.58 (10.1%) 135.58 (9.7%) -1.4% ( -19% - 20%) 0.644
OrHighMedDayTaxoFacets 25.11 (6.6%) 24.81 (8.8%) -1.2% ( -15% - 15%) 0.620
Wildcard 348.15 (6.8%) 343.94 (6.3%) -1.2% ( -13% - 12%) 0.559
TermDTSort 188.94 (10.2%) 187.75 (9.7%) -0.6% ( -18% - 21%) 0.841
HighTermMonthSort 192.52 (9.7%) 191.37 (9.3%) -0.6% ( -17% - 20%) 0.842
MedTerm 2947.19 (3.0%) 2936.13 (3.7%) -0.4% ( -6% - 6%) 0.726
HighTerm 3104.91 (3.5%) 3100.88 (5.0%) -0.1% ( -8% - 8%) 0.925
LowSloppyPhrase 54.77 (0.8%) 54.75 (2.0%) -0.0% ( -2% - 2%) 0.935
MedTermDayTaxoFacets 93.60 (4.1%) 93.60 (4.6%) 0.0% ( -8% - 9%) 0.998
OrNotHighMed 1661.26 (2.1%) 1661.95 (3.1%) 0.0% ( -5% - 5%) 0.960
AndHighHigh 63.20 (4.6%) 63.25 (5.1%) 0.1% ( -9% - 10%) 0.956
AndHighHighDayTaxoFacets 58.91 (1.1%) 58.96 (1.4%) 0.1% ( -2% - 2%) 0.831
HighPhrase 1040.52 (2.0%) 1041.64 (2.2%) 0.1% ( -3% - 4%) 0.869
MedPhrase 61.40 (2.5%) 61.48 (3.3%) 0.1% ( -5% - 6%) 0.887
MedSloppyPhrase 156.44 (1.6%) 156.80 (3.9%) 0.2% ( -5% - 5%) 0.808
LowPhrase 348.79 (1.1%) 349.68 (2.1%) 0.3% ( -2% - 3%) 0.633
Fuzzy1 141.81 (2.3%) 142.24 (1.6%) 0.3% ( -3% - 4%) 0.633
OrHighNotMed 1471.87 (2.6%) 1476.69 (3.5%) 0.3% ( -5% - 6%) 0.737
OrNotHighHigh 1115.16 (2.6%) 1119.36 (3.6%) 0.4% ( -5% - 6%) 0.704
OrHighNotHigh 1434.59 (3.0%) 1440.05 (2.8%) 0.4% ( -5% - 6%) 0.679
AndHighMedDayTaxoFacets 117.61 (2.8%) 118.17 (3.3%) 0.5% ( -5% - 6%) 0.623
AndHighMed 122.71 (4.5%) 123.41 (5.3%) 0.6% ( -8% - 10%) 0.716
HighSloppyPhrase 10.46 (2.6%) 10.52 (3.7%) 0.6% ( -5% - 7%) 0.552
Respell 77.60 (2.9%) 78.10 (2.4%) 0.6% ( -4% - 6%) 0.453
LowIntervalsOrdered 59.06 (2.0%) 59.52 (2.4%) 0.8% ( -3% - 5%) 0.263
BrowseDateSSDVFacets 4.59 (32.0%) 4.63 (32.4%) 0.8% ( -48% - 96%) 0.938
LowSpanNear 168.24 (2.0%) 169.63 (2.3%) 0.8% ( -3% - 5%) 0.225
MedSpanNear 29.72 (2.7%) 29.98 (3.1%) 0.9% ( -4% - 6%) 0.354
AndHighLow 1433.34 (5.4%) 1446.44 (5.3%) 0.9% ( -9% - 12%) 0.589
OrHighNotLow 1989.50 (3.0%) 2010.48 (4.2%) 1.1% ( -5% - 8%) 0.359
PKLookup 284.02 (4.4%) 287.16 (5.4%) 1.1% ( -8% - 11%) 0.476
HighSpanNear 11.27 (3.3%) 11.39 (4.3%) 1.1% ( -6% - 9%) 0.346
Fuzzy2 161.50 (2.4%) 163.37 (1.2%) 1.2% ( -2% - 4%) 0.051
OrNotHighLow 1490.10 (3.6%) 1511.49 (3.5%) 1.4% ( -5% - 8%) 0.198
BrowseRandomLabelSSDVFacets 20.26 (16.3%) 20.65 (5.0%) 1.9% ( -16% - 27%) 0.616
LowTerm 4286.35 (3.9%) 4381.66 (5.6%) 2.2% ( -7% - 12%) 0.145
MedIntervalsOrdered 23.28 (4.9%) 23.83 (6.4%) 2.4% ( -8% - 14%) 0.185
BrowseDayOfYearSSDVFacets 26.94 (3.8%) 27.59 (9.0%) 2.4% ( -10% - 15%) 0.268
BrowseMonthSSDVFacets 29.56 (9.1%) 30.49 (13.4%) 3.2% ( -17% - 28%) 0.384
HighIntervalsOrdered 11.74 (6.8%) 12.22 (7.5%) 4.1% ( -9% - 19%) 0.071
BrowseMonthTaxoFacets 29.08 (39.3%) 31.68 (44.3%) 8.9% ( -53% - 152%) 0.500
OrHighHigh 49.41 (5.7%) 54.49 (3.8%) 10.3% ( 0% - 20%) 0.000
IntNRQ 127.81 (22.1%) 141.76 (18.7%) 10.9% ( -24% - 66%) 0.092
BrowseDateTaxoFacets 29.23 (43.8%) 32.65 (44.2%) 11.7% ( -53% - 177%) 0.401
BrowseDayOfYearTaxoFacets 29.30 (43.4%) 32.77 (43.9%) 11.8% ( -52% - 175%) 0.391
BrowseRandomLabelTaxoFacets 29.54 (55.5%) 33.08 (56.1%) 12.0% ( -64% - 277%) 0.498
OrHighLow 764.71 (3.7%) 1192.43 (4.0%) 55.9% ( 46% - 66%) 0.000
OrHighMed 210.87 (4.8%) 332.88 (4.9%) 57.9% ( 45% - 70%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
IntNRQ 115.39 (24.7%) 105.19 (26.0%) -8.8% ( -47% - 55%) 0.270
TermDTSort 344.09 (7.6%) 331.37 (2.6%) -3.7% ( -12% - 7%) 0.039
HighTermDayOfYearSort 144.41 (8.0%) 139.75 (4.6%) -3.2% ( -14% - 10%) 0.117
BrowseMonthSSDVFacets 29.55 (13.8%) 28.65 (13.5%) -3.0% ( -26% - 28%) 0.481
HighTermMonthSort 135.90 (7.3%) 133.66 (6.6%) -1.6% ( -14% - 13%) 0.454
AndHighHigh 76.59 (4.6%) 75.54 (3.4%) -1.4% ( -8% - 6%) 0.284
LowSloppyPhrase 28.97 (3.5%) 28.78 (3.0%) -0.7% ( -6% - 6%) 0.529
HighSloppyPhrase 39.97 (2.5%) 39.83 (2.8%) -0.4% ( -5% - 5%) 0.667
MedTermDayTaxoFacets 46.96 (2.7%) 46.83 (4.1%) -0.3% ( -6% - 6%) 0.795
AndHighMed 407.06 (5.4%) 406.05 (4.4%) -0.2% ( -9% - 10%) 0.872
LowPhrase 24.42 (2.4%) 24.40 (2.4%) -0.1% ( -4% - 4%) 0.917
MedPhrase 407.77 (2.4%) 408.15 (2.9%) 0.1% ( -5% - 5%) 0.912
AndHighHighDayTaxoFacets 28.76 (2.0%) 28.81 (1.7%) 0.2% ( -3% - 3%) 0.783
HighIntervalsOrdered 38.00 (7.6%) 38.10 (9.0%) 0.3% ( -15% - 18%) 0.924
HighTermTitleBDVSort 66.71 (8.8%) 67.03 (8.8%) 0.5% ( -15% - 19%) 0.861
MedSpanNear 47.80 (3.1%) 48.05 (3.2%) 0.5% ( -5% - 7%) 0.595
LowIntervalsOrdered 53.59 (3.3%) 53.89 (3.5%) 0.6% ( -6% - 7%) 0.606
AndHighMedDayTaxoFacets 107.62 (2.0%) 108.31 (1.3%) 0.6% ( -2% - 4%) 0.221
OrNotHighMed 1551.89 (3.0%) 1562.23 (3.0%) 0.7% ( -5% - 6%) 0.485
MedSloppyPhrase 177.55 (3.5%) 178.91 (2.5%) 0.8% ( -5% - 6%) 0.427
OrHighMedDayTaxoFacets 9.19 (7.6%) 9.27 (8.0%) 0.9% ( -13% - 17%) 0.723
Fuzzy1 156.05 (3.0%) 157.78 (2.7%) 1.1% ( -4% - 6%) 0.216
MedIntervalsOrdered 81.81 (4.4%) 82.78 (4.5%) 1.2% ( -7% - 10%) 0.400
OrHighNotMed 2064.85 (4.7%) 2091.01 (4.7%) 1.3% ( -7% - 11%) 0.397
HighSpanNear 25.08 (3.7%) 25.42 (4.5%) 1.3% ( -6% - 9%) 0.301
BrowseDayOfYearSSDVFacets 28.09 (12.3%) 28.51 (12.6%) 1.5% ( -20% - 30%) 0.705
LowSpanNear 274.40 (2.3%) 278.54 (2.3%) 1.5% ( -3% - 6%) 0.038
HighPhrase 539.93 (2.8%) 548.73 (2.3%) 1.6% ( -3% - 6%) 0.045
Fuzzy2 119.67 (2.7%) 121.67 (2.6%) 1.7% ( -3% - 7%) 0.044
BrowseRandomLabelSSDVFacets 19.26 (10.1%) 19.65 (9.4%) 2.0% ( -15% - 23%) 0.515
Respell 93.58 (3.8%) 95.47 (2.8%) 2.0% ( -4% - 8%) 0.055
MedTerm 3172.56 (4.2%) 3238.60 (4.0%) 2.1% ( -5% - 10%) 0.107
OrHighNotHigh 1272.14 (5.4%) 1298.96 (4.4%) 2.1% ( -7% - 12%) 0.174
AndHighLow 1336.59 (4.4%) 1365.22 (3.9%) 2.1% ( -5% - 10%) 0.103
LowTerm 3506.73 (4.9%) 3583.07 (4.0%) 2.2% ( -6% - 11%) 0.126
HighTerm 2467.18 (5.1%) 2527.84 (5.0%) 2.5% ( -7% - 13%) 0.123
OrNotHighLow 1842.33 (4.9%) 1888.94 (3.3%) 2.5% ( -5% - 11%) 0.055
OrNotHighHigh 1517.53 (3.4%) 1557.59 (3.6%) 2.6% ( -4% - 10%) 0.018
OrHighNotLow 1759.51 (4.7%) 1808.00 (3.9%) 2.8% ( -5% - 11%) 0.044
Prefix3 1004.83 (6.2%) 1035.73 (5.6%) 3.1% ( -8% - 15%) 0.101
BrowseDateSSDVFacets 4.42 (34.0%) 4.59 (34.6%) 3.9% ( -48% - 109%) 0.721
Wildcard 157.42 (7.6%) 163.60 (6.7%) 3.9% ( -9% - 19%) 0.085
PKLookup 277.42 (6.1%) 289.34 (3.2%) 4.3% ( -4% - 14%) 0.005
OrHighLow 1056.71 (4.0%) 1238.32 (4.2%) 17.2% ( 8% - 26%) 0.000
BrowseMonthTaxoFacets 25.90 (34.9%) 31.19 (47.6%) 20.4% ( -46% - 158%) 0.122
BrowseDateTaxoFacets 26.79 (38.0%) 32.31 (53.2%) 20.6% ( -51% - 180%) 0.158
BrowseDayOfYearTaxoFacets 26.88 (37.8%) 32.46 (53.1%) 20.7% ( -50% - 179%) 0.155
BrowseRandomLabelTaxoFacets 26.25 (48.7%) 33.41 (69.6%) 27.3% ( -61% - 283%) 0.151
OrHighHigh 44.09 (6.6%) 57.48 (5.2%) 30.4% ( 17% - 45%) 0.000
OrHighMed 74.02 (6.5%) 115.40 (6.5%) 55.9% ( 40% - 73%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
Prefix3 1486.03 (5.7%) 1455.45 (6.7%) -2.1% ( -13% - 11%) 0.299
OrHighNotMed 1529.61 (3.6%) 1501.55 (3.6%) -1.8% ( -8% - 5%) 0.104
HighTerm 4492.83 (3.9%) 4420.29 (4.2%) -1.6% ( -9% - 6%) 0.209
BrowseRandomLabelSSDVFacets 20.49 (4.6%) 20.27 (6.3%) -1.1% ( -11% - 10%) 0.542
MedPhrase 167.60 (1.8%) 165.87 (1.8%) -1.0% ( -4% - 2%) 0.067
OrHighNotHigh 1474.90 (3.8%) 1460.19 (3.5%) -1.0% ( -7% - 6%) 0.387
Fuzzy2 111.64 (1.5%) 110.58 (2.2%) -0.9% ( -4% - 2%) 0.113
OrHighNotLow 1609.18 (3.2%) 1594.45 (4.4%) -0.9% ( -8% - 6%) 0.451
HighIntervalsOrdered 23.50 (7.4%) 23.29 (6.9%) -0.9% ( -14% - 14%) 0.699
MedIntervalsOrdered 154.52 (7.7%) 153.18 (7.3%) -0.9% ( -14% - 15%) 0.713
MedTermDayTaxoFacets 30.57 (5.6%) 30.32 (6.2%) -0.8% ( -12% - 11%) 0.668
LowPhrase 198.30 (2.3%) 196.85 (2.3%) -0.7% ( -5% - 4%) 0.324
HighPhrase 775.57 (2.4%) 770.08 (2.2%) -0.7% ( -5% - 4%) 0.337
OrNotHighHigh 1399.97 (3.9%) 1390.71 (3.5%) -0.7% ( -7% - 7%) 0.574
HighTermTitleBDVSort 186.02 (7.2%) 184.86 (7.4%) -0.6% ( -14% - 15%) 0.786
Wildcard 179.63 (3.2%) 178.77 (3.7%) -0.5% ( -7% - 6%) 0.664
PKLookup 290.46 (3.1%) 289.10 (3.6%) -0.5% ( -6% - 6%) 0.659
OrNotHighMed 1461.62 (3.8%) 1454.97 (3.5%) -0.5% ( -7% - 7%) 0.692
TermDTSort 263.72 (7.4%) 262.56 (7.6%) -0.4% ( -14% - 15%) 0.853
LowIntervalsOrdered 125.12 (4.5%) 124.60 (4.4%) -0.4% ( -8% - 8%) 0.764
LowSloppyPhrase 120.77 (1.9%) 120.30 (1.9%) -0.4% ( -4% - 3%) 0.517
AndHighMed 272.51 (5.0%) 271.52 (5.2%) -0.4% ( -10% - 10%) 0.822
MedSloppyPhrase 34.39 (1.8%) 34.27 (1.7%) -0.3% ( -3% - 3%) 0.539
Fuzzy1 145.80 (1.2%) 145.31 (2.3%) -0.3% ( -3% - 3%) 0.559
AndHighMedDayTaxoFacets 36.92 (3.1%) 36.83 (3.2%) -0.3% ( -6% - 6%) 0.801
AndHighHighDayTaxoFacets 41.21 (2.7%) 41.11 (2.4%) -0.2% ( -5% - 4%) 0.772
HighSloppyPhrase 41.09 (2.7%) 41.00 (3.8%) -0.2% ( -6% - 6%) 0.830
MedTerm 3362.24 (3.7%) 3356.21 (3.7%) -0.2% ( -7% - 7%) 0.877
LowTerm 5102.13 (4.2%) 5097.31 (4.6%) -0.1% ( -8% - 9%) 0.946
OrNotHighLow 1842.51 (4.2%) 1841.20 (4.3%) -0.1% ( -8% - 8%) 0.957
HighSpanNear 9.51 (4.6%) 9.52 (5.1%) 0.1% ( -9% - 10%) 0.974
MedSpanNear 97.41 (2.7%) 97.48 (3.2%) 0.1% ( -5% - 6%) 0.937
OrHighMedDayTaxoFacets 8.33 (4.5%) 8.34 (5.3%) 0.1% ( -9% - 10%) 0.960
BrowseMonthSSDVFacets 29.36 (8.1%) 29.39 (8.4%) 0.1% ( -15% - 17%) 0.972
Respell 104.50 (0.9%) 104.71 (2.3%) 0.2% ( -2% - 3%) 0.720
AndHighLow 1632.66 (5.2%) 1636.07 (5.0%) 0.2% ( -9% - 10%) 0.897
AndHighHigh 84.25 (4.5%) 84.47 (3.9%) 0.3% ( -7% - 9%) 0.845
BrowseDayOfYearSSDVFacets 26.66 (9.6%) 26.85 (9.0%) 0.7% ( -16% - 21%) 0.805
LowSpanNear 272.01 (3.9%) 274.66 (5.0%) 1.0% ( -7% - 10%) 0.490
HighTermMonthSort 159.98 (7.4%) 161.76 (11.6%) 1.1% ( -16% - 21%) 0.717
HighTermDayOfYearSort 195.88 (7.0%) 200.34 (8.2%) 2.3% ( -12% - 18%) 0.344
BrowseRandomLabelTaxoFacets 32.84 (52.5%) 33.70 (50.8%) 2.6% ( -65% - 222%) 0.872
BrowseDateTaxoFacets 32.24 (41.3%) 33.31 (41.1%) 3.3% ( -55% - 146%) 0.798
BrowseDayOfYearTaxoFacets 32.26 (41.0%) 33.41 (41.0%) 3.5% ( -55% - 144%) 0.784
BrowseDateSSDVFacets 4.17 (35.7%) 4.41 (37.0%) 5.9% ( -49% - 122%) 0.610
BrowseMonthTaxoFacets 31.23 (39.5%) 33.13 (41.8%) 6.1% ( -53% - 144%) 0.636
IntNRQ 131.85 (17.7%) 140.08 (10.5%) 6.2% ( -18% - 41%) 0.174
OrHighMed 158.82 (7.1%) 198.40 (6.0%) 24.9% ( 11% - 40%) 0.000
OrHighHigh 38.33 (7.5%) 63.25 (8.8%) 65.0% ( 45% - 87%) 0.000
OrHighLow 571.04 (4.1%) 1341.58 (10.0%) 134.9% ( 116% - 155%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
BrowseRandomLabelTaxoFacets 33.86 (55.5%) 33.01 (49.5%) -2.5% ( -69% - 230%) 0.879
AndHighLow 1590.63 (3.8%) 1557.92 (5.3%) -2.1% ( -10% - 7%) 0.160
BrowseDayOfYearSSDVFacets 26.91 (4.4%) 26.46 (6.2%) -1.7% ( -11% - 9%) 0.320
BrowseMonthSSDVFacets 29.07 (7.5%) 28.61 (9.0%) -1.6% ( -16% - 16%) 0.545
OrHighLow 1123.52 (4.4%) 1110.16 (3.2%) -1.2% ( -8% - 6%) 0.330
HighPhrase 424.08 (1.7%) 419.21 (2.9%) -1.1% ( -5% - 3%) 0.126
LowPhrase 150.69 (2.1%) 149.27 (2.2%) -0.9% ( -5% - 3%) 0.156
BrowseMonthTaxoFacets 31.95 (38.9%) 31.66 (40.3%) -0.9% ( -57% - 128%) 0.943
OrNotHighHigh 1478.78 (3.1%) 1469.38 (3.0%) -0.6% ( -6% - 5%) 0.514
HighSpanNear 9.03 (3.6%) 8.98 (3.1%) -0.5% ( -6% - 6%) 0.648
AndHighMed 236.20 (3.1%) 235.23 (3.8%) -0.4% ( -7% - 6%) 0.707
OrHighNotHigh 1404.82 (3.0%) 1399.84 (2.9%) -0.4% ( -6% - 5%) 0.704
MedPhrase 1408.93 (2.3%) 1404.10 (2.6%) -0.3% ( -5% - 4%) 0.660
MedSloppyPhrase 54.44 (2.6%) 54.27 (1.8%) -0.3% ( -4% - 4%) 0.668
Prefix3 88.47 (8.9%) 88.53 (7.0%) 0.1% ( -14% - 17%) 0.977
LowSloppyPhrase 133.40 (3.9%) 133.50 (3.5%) 0.1% ( -7% - 7%) 0.945
LowSpanNear 58.78 (2.2%) 58.84 (1.4%) 0.1% ( -3% - 3%) 0.859
MedSpanNear 27.68 (2.6%) 27.71 (2.1%) 0.1% ( -4% - 4%) 0.889
HighIntervalsOrdered 19.60 (7.8%) 19.64 (9.5%) 0.2% ( -15% - 18%) 0.937
MedTermDayTaxoFacets 67.02 (3.7%) 67.26 (3.3%) 0.4% ( -6% - 7%) 0.750
MedIntervalsOrdered 36.93 (3.4%) 37.09 (3.7%) 0.4% ( -6% - 7%) 0.709
LowIntervalsOrdered 18.28 (3.4%) 18.37 (3.3%) 0.5% ( -5% - 7%) 0.652
HighTerm 2544.46 (4.5%) 2559.83 (4.1%) 0.6% ( -7% - 9%) 0.655
Wildcard 67.00 (4.6%) 67.45 (4.5%) 0.7% ( -8% - 10%) 0.642
PKLookup 285.72 (4.4%) 287.90 (3.9%) 0.8% ( -7% - 9%) 0.566
OrNotHighMed 1769.53 (2.6%) 1783.12 (2.9%) 0.8% ( -4% - 6%) 0.384
BrowseDateTaxoFacets 32.54 (43.7%) 32.79 (40.8%) 0.8% ( -58% - 151%) 0.954
AndHighHighDayTaxoFacets 16.31 (2.7%) 16.44 (2.3%) 0.8% ( -4% - 6%) 0.331
Respell 90.52 (3.5%) 91.24 (2.7%) 0.8% ( -5% - 7%) 0.417
AndHighMedDayTaxoFacets 91.12 (1.6%) 91.87 (1.2%) 0.8% ( -1% - 3%) 0.068
BrowseDayOfYearTaxoFacets 32.59 (43.4%) 32.87 (40.7%) 0.9% ( -58% - 150%) 0.949
OrHighNotMed 1303.02 (3.7%) 1314.61 (3.8%) 0.9% ( -6% - 8%) 0.457
OrNotHighLow 1526.42 (2.6%) 1540.29 (3.3%) 0.9% ( -4% - 7%) 0.338
OrHighNotLow 1872.15 (3.2%) 1889.45 (4.0%) 0.9% ( -6% - 8%) 0.422
HighSloppyPhrase 43.82 (2.0%) 44.25 (2.9%) 1.0% ( -3% - 6%) 0.203
MedTerm 2719.77 (3.3%) 2747.44 (5.0%) 1.0% ( -7% - 9%) 0.446
Fuzzy1 141.59 (2.7%) 143.05 (2.7%) 1.0% ( -4% - 6%) 0.224
AndHighHigh 132.53 (2.8%) 133.91 (3.0%) 1.0% ( -4% - 7%) 0.261
LowTerm 3170.39 (4.0%) 3215.56 (3.7%) 1.4% ( -6% - 9%) 0.243
Fuzzy2 102.17 (2.5%) 103.81 (2.7%) 1.6% ( -3% - 6%) 0.051
IntNRQ 229.02 (7.8%) 232.85 (4.9%) 1.7% ( -10% - 15%) 0.415
BrowseRandomLabelSSDVFacets 20.62 (8.0%) 21.00 (9.4%) 1.8% ( -14% - 20%) 0.505
HighTermDayOfYearSort 105.81 (7.9%) 108.08 (9.1%) 2.1% ( -13% - 20%) 0.428
OrHighMedDayTaxoFacets 15.57 (7.4%) 16.05 (8.8%) 3.1% ( -12% - 20%) 0.226
HighTermTitleBDVSort 282.98 (3.1%) 291.81 (8.1%) 3.1% ( -7% - 14%) 0.107
TermDTSort 208.95 (3.3%) 215.60 (8.1%) 3.2% ( -7% - 15%) 0.104
HighTermMonthSort 278.35 (3.2%) 293.92 (9.6%) 5.6% ( -7% - 19%) 0.014
BrowseDateSSDVFacets 4.10 (31.4%) 4.40 (34.8%) 7.1% ( -44% - 106%) 0.497
OrHighHigh 44.28 (6.0%) 49.24 (5.9%) 11.2% ( 0% - 24%) 0.000
OrHighMed 96.24 (6.4%) 257.56 (11.3%) 167.6% ( 140% - 197%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighPhraseHighPhrase 26.43 (11.3%) 22.85 (6.3%) -13.6% ( -27% - 4%) 0.000
AndHighOrMedMed 93.60 (10.5%) 94.62 (7.9%) 1.1% ( -15% - 21%) 0.711
PKLookup 213.78 (10.3%) 219.68 (12.4%) 2.8% ( -18% - 28%) 0.444
AndMedOrHighHigh 72.96 (12.7%) 76.81 (7.9%) 5.3% ( -13% - 29%) 0.115
OrAndHigMedAndHighMed 117.08 (11.1%) 128.52 (9.3%) 9.8% ( -9% - 33%) 0.003
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighPhraseHighPhrase 27.43 (13.1%) 24.07 (4.3%) -12.3% ( -26% - 5%) 0.000
AndHighOrMedMed 96.00 (12.8%) 99.20 (4.0%) 3.3% ( -11% - 23%) 0.265
PKLookup 212.67 (18.0%) 224.37 (14.3%) 5.5% ( -22% - 46%) 0.284
AndMedOrHighHigh 75.19 (16.1%) 80.60 (4.2%) 7.2% ( -11% - 32%) 0.052
OrAndHigMedAndHighMed 120.29 (14.5%) 135.06 (4.9%) 12.3% ( -6% - 36%) 0.000
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1164436120
@zacharymorn FYI I played with a slightly different approach that implements BMM as a bulk scorer instead of a scorer, which I was hoping would help with making bookkeeping more lightweight: https://github.com/jpountz/lucene/tree/maxscore. It could be interesting to compare with your implementation.
One optimization it has that seemed to help that your scorer doesn't have is to check for every non-essential scorer whether the score obtained so far plus the sum of max scores of non essential scorers that haven't been checked yet is still competitive.
I got the following results on one run on wikimedium10m:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighNotLow 1493.13 (6.5%) 1445.29 (5.1%) -3.2% ( -13% - 8%) 0.083
OrNotHighMed 1410.19 (3.8%) 1373.37 (3.1%) -2.6% ( -9% - 4%) 0.017
OrNotHighHigh 1057.88 (5.1%) 1031.19 (4.4%) -2.5% ( -11% - 7%) 0.096
OrHighNotMed 1525.10 (5.2%) 1486.80 (4.4%) -2.5% ( -11% - 7%) 0.098
OrHighNotHigh 1250.31 (4.3%) 1221.99 (3.4%) -2.3% ( -9% - 5%) 0.062
IntNRQ 531.54 (2.9%) 522.49 (2.7%) -1.7% ( -7% - 3%) 0.053
Fuzzy1 111.13 (2.1%) 109.80 (2.6%) -1.2% ( -5% - 3%) 0.107
AndHighMed 386.29 (4.1%) 381.84 (3.3%) -1.2% ( -8% - 6%) 0.329
AndHighHigh 78.96 (5.6%) 78.18 (4.7%) -1.0% ( -10% - 9%) 0.548
BrowseDateSSDVFacets 4.51 (12.6%) 4.47 (12.4%) -0.8% ( -22% - 27%) 0.836
OrNotHighLow 1316.24 (3.8%) 1305.93 (3.1%) -0.8% ( -7% - 6%) 0.476
OrHighMedDayTaxoFacets 20.87 (5.1%) 20.71 (4.2%) -0.8% ( -9% - 9%) 0.609
BrowseMonthSSDVFacets 23.54 (6.4%) 23.42 (7.4%) -0.5% ( -13% - 14%) 0.817
BrowseRandomLabelTaxoFacets 37.54 (1.7%) 37.37 (1.9%) -0.5% ( -4% - 3%) 0.432
MedSpanNear 68.68 (1.7%) 68.37 (2.2%) -0.4% ( -4% - 3%) 0.474
AndHighHighDayTaxoFacets 10.78 (5.9%) 10.73 (4.7%) -0.4% ( -10% - 10%) 0.794
BrowseMonthTaxoFacets 28.39 (10.0%) 28.29 (9.1%) -0.3% ( -17% - 20%) 0.910
HighTermDayOfYearSort 171.78 (13.7%) 171.22 (13.2%) -0.3% ( -23% - 30%) 0.939
PKLookup 245.27 (2.2%) 244.52 (1.9%) -0.3% ( -4% - 3%) 0.635
HighSloppyPhrase 39.08 (2.9%) 38.96 (4.3%) -0.3% ( -7% - 7%) 0.795
HighTermMonthSort 167.47 (15.1%) 167.06 (14.7%) -0.2% ( -26% - 34%) 0.959
HighPhrase 250.14 (2.8%) 249.53 (2.3%) -0.2% ( -5% - 5%) 0.767
TermDTSort 138.22 (14.0%) 137.97 (13.4%) -0.2% ( -24% - 31%) 0.967
Fuzzy2 55.22 (1.6%) 55.17 (1.5%) -0.1% ( -3% - 3%) 0.837
MedTerm 1844.25 (6.4%) 1843.10 (4.9%) -0.1% ( -10% - 11%) 0.972
MedSloppyPhrase 15.34 (2.2%) 15.33 (3.9%) -0.1% ( -5% - 6%) 0.954
Prefix3 110.03 (2.6%) 110.07 (1.8%) 0.0% ( -4% - 4%) 0.962
HighSpanNear 7.95 (1.7%) 7.97 (1.7%) 0.2% ( -3% - 3%) 0.772
BrowseDayOfYearTaxoFacets 46.78 (1.9%) 46.86 (2.1%) 0.2% ( -3% - 4%) 0.788
AndHighLow 1291.99 (2.6%) 1294.28 (3.4%) 0.2% ( -5% - 6%) 0.854
LowSpanNear 47.55 (1.5%) 47.64 (1.4%) 0.2% ( -2% - 3%) 0.697
Wildcard 157.83 (1.5%) 158.14 (1.3%) 0.2% ( -2% - 3%) 0.661
LowPhrase 83.20 (2.3%) 83.37 (2.1%) 0.2% ( -4% - 4%) 0.773
Respell 95.18 (1.4%) 95.47 (1.3%) 0.3% ( -2% - 3%) 0.492
AndHighMedDayTaxoFacets 51.97 (1.8%) 52.16 (2.1%) 0.4% ( -3% - 4%) 0.553
BrowseDateTaxoFacets 45.77 (2.0%) 45.98 (1.9%) 0.5% ( -3% - 4%) 0.452
MedTermDayTaxoFacets 60.66 (5.9%) 61.03 (5.0%) 0.6% ( -9% - 12%) 0.718
MedPhrase 57.67 (3.1%) 58.06 (2.6%) 0.7% ( -4% - 6%) 0.452
BrowseDayOfYearSSDVFacets 20.40 (6.0%) 20.57 (4.2%) 0.8% ( -8% - 11%) 0.609
LowSloppyPhrase 37.59 (4.0%) 38.00 (3.6%) 1.1% ( -6% - 9%) 0.376
BrowseRandomLabelSSDVFacets 15.25 (5.2%) 15.41 (6.9%) 1.1% ( -10% - 13%) 0.571
HighTerm 2001.23 (6.4%) 2025.82 (4.9%) 1.2% ( -9% - 13%) 0.493
LowTerm 2092.97 (4.3%) 2119.02 (5.5%) 1.2% ( -8% - 11%) 0.423
MedIntervalsOrdered 56.91 (3.9%) 57.92 (3.0%) 1.8% ( -4% - 9%) 0.107
HighIntervalsOrdered 16.67 (6.2%) 16.97 (4.6%) 1.8% ( -8% - 13%) 0.297
LowIntervalsOrdered 20.18 (4.3%) 20.57 (3.3%) 1.9% ( -5% - 10%) 0.113
HighTermTitleBDVSort 182.32 (14.2%) 186.92 (22.0%) 2.5% ( -29% - 45%) 0.667
OrHighLow 1235.23 (1.8%) 1484.12 (4.8%) 20.1% ( 13% - 27%) 0.000
OrHighMed 156.75 (4.7%) 200.46 (4.7%) 27.9% ( 17% - 39%) 0.000
OrHighHigh 25.07 (5.2%) 48.30 (9.1%) 92.6% ( 74% - 112%) 0.000
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1165218655
Thanks @jpountz for the suggestion and also providing the bulk scorer implementation! The result looks pretty impressive as well!
I just tried `taskRepeatCount=200` with my implementation, and although it did make the results more stable across runs, the speed up with full tasks was still no where near that from just running the three disjunction tasks. I also pulled your version and ran through the same set of tests above, and the speed up were both good and stable (which rules out issue from the util side I guess). I will study your implementation further and see if mine can be improved accordingly.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1170684358
> With this change, I suspect that some scorers created in `TestWANDScorer` would now use your new `BlockMaxMaxScoreScorer`, which is going to decrease the coverage of WANDScorer. Can we somehow make sure that `TestWANDScorer` always gets a `WANDScorer`? E.g. I spotted this query under `TestWANDScorer#testBasics` which likely uses your now scorer:
>
> ```java
> // test a filtered disjunction
> query =
> new BooleanQuery.Builder()
> .add(
> new BooleanQuery.Builder()
> .add(
> new BoostQuery(
> new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2),
> Occur.SHOULD)
> .add(new ConstantScoreQuery(new TermQuery(new Term("foo", "B"))), Occur.SHOULD)
> .build(),
> Occur.MUST)
> .add(new TermQuery(new Term("foo", "C")), Occur.FILTER)
> .build();
> ```
Yeah this is a good question. In my newly added tests I have used something like this to confirm it's testing the right scorer, but I'm not totally happy about this approach myself :
```
if (scorer instanceof AssertingScorer) {
assertTrue(((AssertingScorer) scorer).getIn() instanceof BlockMaxMaxscoreScorer);
} else {
assertTrue(scorer instanceof BlockMaxMaxscoreScorer);
}
```
One alternative approach could be instantiating `WANDScorer` directly inside the test for lower level tests, and moving the higher level tests into another test class that doesn't care about the specific scorer implementation for disjunction? This may require duplicating some code from `BooleanWeight`, `AssertingWeight` etc though but should be do-able.
On the other hand, if we don't plan on initiating `WANDScorer` directly in the test, varying the query clauses and asserting like above might be the best we could do I feel? This has the potential test coverage decrease issue as you suggested so may not be ideal either.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1164220705
The fact that queries perform slower in general in your first benchmark run makes me wonder if this could be due to insufficient warmup time. The default task repeat count of 20 might be too low for these queries that are very good at skipping irrelevant documents. Maybe try passing `taskRepeatCount=100` in the ctor of your `Competition` object? Does it make queries run closer in terms of performance to your second benchmark run?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1163861983
Thanks @jpountz for looking into this! I did further experiments on this and the result seems to suggest it may be caused by bug / caching in the util or lucene itself.
What I did was I first only kept 1 query per pure disjunction task, and remove the rest of the tasks like below.
```
OrHighHigh: several following # freq=436129 freq=416515
OrHighMed: international chris # freq=418261 freq=85523
OrHighLow: 2005 valois # freq=835460 freq=2277
```
and got this result:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
PKLookup 189.27 (23.2%) 220.18 (16.1%) 16.3% ( -18% - 72%) 0.010
OrHighHigh 10.37 (41.8%) 20.92 (94.9%) 101.8% ( -24% - 410%) 0.000
OrHighMed 21.43 (54.3%) 56.18 (138.5%) 162.2% ( -19% - 777%) 0.000
OrHighLow 138.14 (26.2%) 368.50 (91.9%) 166.7% ( 38% - 385%) 0.000
```
However, when I added back the rest of the tasks but still kept 1 query for each of the three disjunction tasks, I got vastly different results:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighHigh 48.31 (7.8%) 38.21 (5.6%) -20.9% ( -31% - -8%) 0.000
OrHighMed 152.23 (8.5%) 140.16 (8.5%) -7.9% ( -23% - 9%) 0.003
BrowseDayOfYearSSDVFacets 21.67 (12.4%) 20.84 (7.1%) -3.8% ( -20% - 17%) 0.231
MedPhrase 103.08 (5.3%) 102.43 (9.1%) -0.6% ( -14% - 14%) 0.790
TermDTSort 162.97 (10.6%) 162.10 (6.4%) -0.5% ( -15% - 18%) 0.847
HighSloppyPhrase 62.12 (7.3%) 61.99 (5.7%) -0.2% ( -12% - 13%) 0.921
OrHighNotMed 1216.05 (4.1%) 1216.03 (3.2%) -0.0% ( -7% - 7%) 0.999
HighTerm 2088.45 (4.2%) 2091.84 (3.5%) 0.2% ( -7% - 8%) 0.895
BrowseMonthSSDVFacets 23.34 (10.0%) 23.49 (11.0%) 0.6% ( -18% - 24%) 0.845
MedTerm 3189.76 (3.5%) 3215.19 (3.9%) 0.8% ( -6% - 8%) 0.497
AndHighHigh 59.64 (6.9%) 60.14 (6.3%) 0.8% ( -11% - 15%) 0.688
MedSpanNear 46.86 (5.9%) 47.26 (7.8%) 0.9% ( -12% - 15%) 0.692
HighTermTitleBDVSort 125.65 (7.5%) 126.83 (10.1%) 0.9% ( -15% - 20%) 0.738
LowPhrase 21.25 (4.2%) 21.62 (4.4%) 1.8% ( -6% - 10%) 0.194
BrowseRandomLabelSSDVFacets 15.38 (6.6%) 15.66 (10.1%) 1.8% ( -13% - 19%) 0.509
HighSpanNear 20.48 (5.6%) 20.86 (5.1%) 1.9% ( -8% - 13%) 0.270
HighTermDayOfYearSort 187.51 (7.6%) 191.07 (9.4%) 1.9% ( -14% - 20%) 0.482
OrHighNotLow 1505.18 (10.5%) 1535.43 (4.4%) 2.0% ( -11% - 18%) 0.431
LowSloppyPhrase 233.02 (6.9%) 237.95 (8.4%) 2.1% ( -12% - 18%) 0.383
MedIntervalsOrdered 18.37 (5.0%) 18.77 (5.3%) 2.2% ( -7% - 13%) 0.177
OrNotHighMed 1310.81 (4.0%) 1342.33 (5.1%) 2.4% ( -6% - 11%) 0.096
MedSloppyPhrase 40.20 (6.2%) 41.18 (5.7%) 2.4% ( -8% - 15%) 0.190
AndHighHighDayTaxoFacets 13.90 (5.6%) 14.25 (3.7%) 2.6% ( -6% - 12%) 0.090
AndHighMed 566.84 (5.8%) 582.07 (5.7%) 2.7% ( -8% - 15%) 0.138
OrNotHighHigh 1976.63 (5.0%) 2030.44 (3.5%) 2.7% ( -5% - 11%) 0.044
Fuzzy2 50.72 (10.3%) 52.17 (7.9%) 2.9% ( -13% - 23%) 0.325
MedTermDayTaxoFacets 79.53 (6.7%) 81.86 (7.5%) 2.9% ( -10% - 18%) 0.192
OrHighNotHigh 1169.61 (6.6%) 1204.25 (3.8%) 3.0% ( -6% - 14%) 0.080
HighPhrase 400.95 (5.5%) 413.11 (2.2%) 3.0% ( -4% - 11%) 0.022
OrHighMedDayTaxoFacets 13.10 (5.0%) 13.50 (5.6%) 3.1% ( -7% - 14%) 0.066
AndHighMedDayTaxoFacets 52.75 (6.4%) 54.42 (6.3%) 3.2% ( -8% - 16%) 0.115
OrNotHighLow 2842.51 (3.5%) 2935.45 (5.5%) 3.3% ( -5% - 12%) 0.025
LowTerm 3032.44 (3.7%) 3140.83 (3.0%) 3.6% ( -3% - 10%) 0.001
HighTermMonthSort 210.58 (11.4%) 218.63 (11.3%) 3.8% ( -16% - 29%) 0.286
Fuzzy1 135.95 (8.1%) 141.35 (9.6%) 4.0% ( -12% - 23%) 0.158
IntNRQ 365.48 (17.1%) 380.05 (12.1%) 4.0% ( -21% - 40%) 0.395
Prefix3 72.71 (11.4%) 75.69 (9.5%) 4.1% ( -15% - 28%) 0.217
Wildcard 333.86 (9.4%) 347.59 (10.5%) 4.1% ( -14% - 26%) 0.192
LowSpanNear 40.56 (4.1%) 42.49 (6.4%) 4.8% ( -5% - 15%) 0.005
Respell 56.68 (7.6%) 59.68 (5.5%) 5.3% ( -7% - 19%) 0.012
AndHighLow 1062.10 (6.2%) 1118.34 (5.9%) 5.3% ( -6% - 18%) 0.005
HighIntervalsOrdered 9.10 (4.5%) 9.64 (6.5%) 5.9% ( -4% - 17%) 0.001
LowIntervalsOrdered 15.02 (5.1%) 15.96 (5.2%) 6.2% ( -3% - 17%) 0.000
BrowseDateSSDVFacets 2.29 (6.5%) 2.44 (19.1%) 6.4% ( -18% - 34%) 0.158
OrHighLow 931.34 (4.9%) 995.65 (3.7%) 6.9% ( -1% - 16%) 0.000
PKLookup 244.93 (10.0%) 262.86 (11.4%) 7.3% ( -12% - 31%) 0.031
BrowseMonthTaxoFacets 23.30 (42.2%) 28.97 (50.4%) 24.3% ( -47% - 202%) 0.098
BrowseDateTaxoFacets 23.39 (44.9%) 29.54 (52.5%) 26.3% ( -49% - 224%) 0.089
BrowseDayOfYearTaxoFacets 23.38 (44.2%) 30.09 (54.9%) 28.7% ( -48% - 228%) 0.069
BrowseRandomLabelTaxoFacets 22.47 (55.6%) 29.49 (68.4%) 31.2% ( -59% - 349%) 0.113
```
I've attached the tasks file for reference here as well
[wikimedium.10M.nostopwords.tasks.txt](https://github.com/apache/lucene/files/8962486/wikimedium.10M.nostopwords.tasks.txt)
@mikemccand , do you have any suggestion where this discrepancy might be coming from? I'll continue to run experiments as well to see if I can pinpoint the issue.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r911590359
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
Review Comment:
I've created a follow-up issue for this https://issues.apache.org/jira/browse/LUCENE-10636.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r910551829
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
Review Comment:
Sorry this was an old comment from previous (last year) experimentation. I've updated it and also kept it flexible for future stats change.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r910551704
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
Review Comment:
Moved `upTo` as well as a few others into constructor.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
Review Comment:
Updated.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
Review Comment:
Ah yes updated.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
Review Comment:
Updated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1166188875
Alright. As it turns out, the reason I'm getting vastly different performance results as I change tasks file here https://github.com/apache/lucene/pull/972#issuecomment-1163861983 is that, I have previously configured `SEARCH_NUM_THREADS = 10` in my `localconstants.py` and my changes somehow work better when more threads are executing disjunction queries at the same time. When the tasks file is unmodified and has more than 10 query tasks, the disjunction tasks get executed effectively serially; but when the tasks file only has 3 disjunction tasks, each task is executed multi-threaded.
Here are the results with only 3 disjunction tasks with 3 queries each, but with different number of search threads:
`SEARCH_NUM_THREADS = 10`
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
PKLookup 197.09 (18.1%) 202.07 (20.2%) 2.5% ( -30% - 49%) 0.678
OrHighLow 217.92 (28.3%) 433.05 (46.3%) 98.7% ( 18% - 241%) 0.000
OrHighHigh 12.53 (48.0%) 27.27 (64.9%) 117.7% ( 3% - 443%) 0.000
OrHighMed 14.09 (50.2%) 40.60 (74.8%) 188.1% ( 42% - 628%) 0.000
```
`SEARCH_NUM_THREADS = 2`
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighLow 912.29 (12.1%) 903.58 (8.2%) -1.0% ( -18% - 22%) 0.771
PKLookup 252.26 (15.0%) 260.21 (15.5%) 3.2% ( -23% - 39%) 0.514
OrHighMed 70.16 (11.8%) 84.99 (8.9%) 21.1% ( 0% - 47%) 0.000
OrHighHigh 38.24 (11.9%) 49.09 (10.7%) 28.4% ( 5% - 57%) 0.000
```
Just curious @jpountz , were you using the default `SEARCH_NUM_THREADS = 2` during your benchmark?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r908131031
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
Review Comment:
Other boolean queries sum up scores into a `double` to reduce the impact of rounding on accuracy, though this only helps when there are 3 clauses or more, otherwise summing up into a double doesn't yield better accuracy. Can you either add a comment that this would need to be changed if we were to support more than two clauses with this scorer, or replace it with a double?
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
Review Comment:
This doesn't look correct to me. The overall score could still be above the `minCompetitiveScore` after summing up scores of the non-essential clauses? Should it be something like below:
```java
if (matchedMaxScoreSum + nonEssentialMaxScoreSum < minCompetitiveScore) {
```
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
+ nonEssentialMaxScoreSum += w.maxScoreFloat;
+ } else {
+ maxScoreSortedEssentialScorers.add(w);
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ @Override
+ public long cost() {
+ // fixed at initialization
+ return cost;
+ }
+ };
+
+ return new TwoPhaseIterator(approximation) {
+ @Override
+ public boolean matches() throws IOException {
+ return score() >= minCompetitiveScore;
Review Comment:
Implementing `matches()` like that has the good side effect that we only pass good matches to the collector, but the bad side-effect that scores are computed twice for competitive hits. I wonder if we should try to cache the score so that it's not computed another time by the collector.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
Review Comment:
I would add a comment that taking the `max` is probably a good approach for 2 clauses but might need more thinking if we want this scorer to perform well with more than 2 clauses.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
Review Comment:
We don't seem to actually need the `scoreMode`, let's remove it and make sure `Boolean2ScorerSupplier` only uses this scorer for TOP_SCORES?
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
+ nonEssentialMaxScoreSum += w.maxScoreFloat;
+ } else {
+ maxScoreSortedEssentialScorers.add(w);
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ @Override
+ public long cost() {
+ // fixed at initialization
+ return cost;
+ }
+ };
+
+ return new TwoPhaseIterator(approximation) {
+ @Override
+ public boolean matches() throws IOException {
+ return score() >= minCompetitiveScore;
+ }
+
+ @Override
+ public float matchCost() {
+ // maximum number of scorer that matches() might advance
+ return allScorers.length - essentialsScorers.size();
Review Comment:
`matchCost` is expected to be fixed, we shouldn't try to adjust it based on the number of essential scorers, see e.g. how `ConjunctionDISI` uses it to sort two-phase iterators up-front.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r909131198
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
Review Comment:
Done.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
+ nonEssentialMaxScoreSum += w.maxScoreFloat;
+ } else {
+ maxScoreSortedEssentialScorers.add(w);
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ @Override
+ public long cost() {
+ // fixed at initialization
+ return cost;
+ }
+ };
+
+ return new TwoPhaseIterator(approximation) {
+ @Override
+ public boolean matches() throws IOException {
+ return score() >= minCompetitiveScore;
Review Comment:
Ah good catch. I've updated it to cache the calculation.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r908672681
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
Review Comment:
Thanks for clarifying, I was confused indeed by the name, which suggested to me that this score only included clauses that we "matched" so far.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn merged pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn merged PR #972:
URL: https://github.com/apache/lucene/pull/972
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r911579245
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
Review Comment:
Updated.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private double nonEssentialMaxScoreSum;
+
+ private long cost;
Review Comment:
Updated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r909281863
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
Review Comment:
Nit: it's inconsistent that `upTo` gets initialized here while `doc` is initialized in the constructor.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
Review Comment:
we could probably cache the score that is computed here so that we don't re-compute it later in score(), I'm happy to leave it for a follow-up, it's up to you
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
Review Comment:
Make it a double since it's a sum of scores?
##########
lucene/core/src/java/org/apache/lucene/search/WANDScorer.java:
##########
@@ -106,7 +106,7 @@ static long scaleMaxScore(float maxScore, int scalingFactor) {
* Scale min competitive scores the same way as max scores but this time by rounding down in order
* to make sure that we do not miss any matches.
*/
- private static long scaleMinScore(float minScore, int scalingFactor) {
+ static long scaleMinScore(float minScore, int scalingFactor) {
Review Comment:
we don't seem to need to make this method pkg-private anymore
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
Review Comment:
We could do this so that it would work with any number of clauses.
```suggestion
if (maxScoreSumPropagator.scoreSumUpperBound(docScoreUpperBound) < minCompetitiveScore) {
```
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ // using Math.max here is a good approach when there are only two clauses,
+ // but when this scorer is used for more than two clauses, we may need to
+ // consider other approaches such as avg, as the further out the boundary,
+ // the higher maxScore would be for a scorer, which makes skipping based on
+ // comparison with minCompetitiveScore harder / less effective.
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
Review Comment:
```suggestion
if (maxScoreSumPropagator.scoreSumUpperBound(nonEssentialMaxScoreSum + w.maxScoreFloat) < minCompetitiveScore) {
```
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ // using Math.max here is a good approach when there are only two clauses,
+ // but when this scorer is used for more than two clauses, we may need to
+ // consider other approaches such as avg, as the further out the boundary,
+ // the higher maxScore would be for a scorer, which makes skipping based on
+ // comparison with minCompetitiveScore harder / less effective.
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
Review Comment:
I don't think `w.doc` can ever be greater than `upTo` given how `upTo` is computed?
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
Review Comment:
likewise here:
```suggestion
&& maxScoreSumPropagator.scoreSumUpperBound(nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat)
```
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
Review Comment:
The comment says that the next block interval boundary is the minimum across all scorers but a few lines below in `updateUpToAndMaxScore` you take the max?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r910552757
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ // using Math.max here is a good approach when there are only two clauses,
+ // but when this scorer is used for more than two clauses, we may need to
+ // consider other approaches such as avg, as the further out the boundary,
+ // the higher maxScore would be for a scorer, which makes skipping based on
+ // comparison with minCompetitiveScore harder / less effective.
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
Review Comment:
It is true that `w.doc` is always less than or equal to `upTo` since we use `Math.max` to compute `upTo`, but I'm also wondering if we should keep this branch still (with additional comment) since the way `upTo` gets computed may change in the future for more clauses, and this branch might be easy to miss? On the other hand, I'm also fine with removing it for now and just update the comment above to mention handling this branch as well.
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ // using Math.max here is a good approach when there are only two clauses,
+ // but when this scorer is used for more than two clauses, we may need to
+ // consider other approaches such as avg, as the further out the boundary,
+ // the higher maxScore would be for a scorer, which makes skipping based on
+ // comparison with minCompetitiveScore harder / less effective.
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
+ w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+ } else {
+ // This scorer won't be able to contribute to match for target, setting its maxScore
+ // to 0 so it goes into nonEssentialList
+ w.maxScoreFloat = 0;
+ }
+ }
+ }
+
+ private void repartitionLists() {
+ essentialsScorers.clear();
+ maxScoreSortedEssentialScorers.clear();
+ Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> scorer.maxScoreFloat));
+
+ // Re-partition the scorers into non-essential list and essential list, as defined in
+ // the "Optimizing Top-k Document Retrieval Strategies for Block-Max Indexes" paper.
+ nonEssentialMaxScoreSum = 0;
+ for (DisiWrapper w : allScorers) {
+ if (nonEssentialMaxScoreSum + w.maxScoreFloat < minCompetitiveScore) {
Review Comment:
Updated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r910552971
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
Review Comment:
I just gave it a try, but it seems caching a partial score from essential list here may require more changes / additional data structure inside `score()`, as this scorer no longer maintains the non-essential list explicitly, but only remembers sum of maxScore from non-essential list, so it will need to differentiate scorers from non-essential list when using the cached essential list score. I think I will prefer to have a follow-up issue on this ?
##########
lucene/core/src/java/org/apache/lucene/search/WANDScorer.java:
##########
@@ -106,7 +106,7 @@ static long scaleMaxScore(float maxScore, int scalingFactor) {
* Scale min competitive scores the same way as max scores but this time by rounding down in order
* to make sure that we do not miss any matches.
*/
- private static long scaleMinScore(float minScore, int scalingFactor) {
+ static long scaleMinScore(float minScore, int scalingFactor) {
Review Comment:
Updated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r911579468
##########
lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java:
##########
@@ -39,6 +39,9 @@ public class DisiWrapper {
// For WANDScorer
long maxScore;
+ // For BlockMaxMaxscoreScorer
+ float maxScoreFloat;
Review Comment:
Ah I like this idea. Updated.
##########
lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java:
##########
@@ -118,6 +118,21 @@ private Scorer getInternal(long leadCost) throws IOException {
leadCost);
}
+ // pure two terms disjunction
+ if (scoreMode == ScoreMode.TOP_SCORES
+ && minShouldMatch == 0
Review Comment:
Updated.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1167025504
> I feel the effect would be similar?
Indeed, sorry I had misread your code!
> In terms of next steps, I'm wondering if there's a preference between bulk scorer and scorer implementations when performance improvement is similar
No, it shouldn't matter. Bulk scorers sometimes help yield better performance because it's easier for them to amortize computation across docs, but if they don't yield better performance, there's no point in using a bulk scorer instead of a regular scorer.
I agree that it looks like a great speedup, we should get this in! The benchmark only tests performance of top-level disjunctions of term queries that have two clauses. I'd be curious to get performance numbers for queries like the below ones to see if we need to fine-tune a bit more when this new scorer gets used. Note that I don't think we need to get the performance better for all these queries to merge the change, we could start by only using this new scorer for the (common) case of a top-level disjunction of 2 term queries, and later see if this scorer can handle more disjunctions.
```
OrAndHigMedAndHighMed: (+including +looking) (+date +finished) # disjunction of conjunctions, which don't have as good score upper bounds as term queries
OrHighPhraseHighPhrase: "united states" "new york" # disjunction of phrase queries, which don't have as good score upper bounds as term queries and are slow to advance
AndHighOrMedMed: +be +(mostly interview) # disjunction within conjunction that leads iteration
AndMedOrHighHigh: +interview +(at united) # disjunction within conjunction that doesn't lead iteration
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r909134436
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
Review Comment:
Yup sorry about the confusion! I've renamed the variable to `docScoreUpperBound`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r908663130
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ private final ScoreMode scoreMode;
+
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ /**
+ * Constructs a Scorer
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ * @param scoreMode The scoreMode
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode scoreMode)
+ throws IOException {
+ super(weight);
+ assert scoreMode == ScoreMode.TOP_SCORES;
+
+ this.scoreMode = scoreMode;
+ this.doc = -1;
+
+ this.allScorers = new DisiWrapper[scorers.size()];
+ int i = 0;
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (Scorer scorer : scorers) {
+ DisiWrapper w = new DisiWrapper(scorer);
+ cost += w.cost;
+ allScorers[i++] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ matchedMaxScoreSum += w.scorer.score();
+ }
+
+ if (matchedMaxScoreSum < minCompetitiveScore) {
Review Comment:
Hmm `matchedMaxScoreSum` was actually initialized with `nonEssentialMaxScoreSum` [here](https://github.com/apache/lucene/pull/972/files/cb8ab7485a405e9517049822eef36ae590f2f65b#diff-6d2292dfdef5525e6205d3298d49b2c81c41ec41bea6e30148f3cf22f6a0f5d0R142) , so `nonEssentialMaxScoreSum` was already included in the calculation? Maybe the naming of the variable could be confusing?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] jpountz commented on a diff in pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
jpountz commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r910683270
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private double nonEssentialMaxScoreSum;
+
+ private long cost;
Review Comment:
nit: let's make it final
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
+
+ if ((float) docScoreUpperBound < minCompetitiveScore) {
+ // skip straight to next candidate doc from essential scorer
+ int docId = top.doc;
+ do {
+ top.doc = top.iterator.nextDoc();
+ top = essentialsScorers.updateTop();
+ } while (top.doc == docId);
+
+ target = top.doc;
+ } else {
+ return doc = top.doc;
+ }
+ }
+ }
+ }
+ }
+
+ private void movePotentiallyNonCompetitiveScorers() {
+ while (maxScoreSortedEssentialScorers.size() > 0
+ && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+ < minCompetitiveScore) {
+ DisiWrapper nextLeastContributingScorer =
+ maxScoreSortedEssentialScorers.removeFirst();
+ nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat;
+ }
+
+ // list adjusted
+ if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) {
+ essentialsScorers.clear();
+ for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+ essentialsScorers.add(w);
+ }
+ }
+ }
+
+ private void updateMaxScoresAndLists(int target) throws IOException {
+ assert target > upTo;
+ // Next candidate doc id is above interval boundary, or minCompetitive has increased.
+ // Find next interval boundary.
+ // Block boundary alignment strategy is adapted from "Optimizing Top-k Document
+ // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel.
+ // Find the block interval boundary that is the minimum of all participating scorer's
+ // block boundary. Then run BMM within each interval.
+ updateUpToAndMaxScore(target);
+ repartitionLists();
+ }
+
+ private void updateUpToAndMaxScore(int target) throws IOException {
+ // reset upTo
+ upTo = -1;
+ for (DisiWrapper w : allScorers) {
+ // using Math.max here is a good approach when there are only two clauses,
+ // but when this scorer is used for more than two clauses, we may need to
+ // consider other approaches such as avg, as the further out the boundary,
+ // the higher maxScore would be for a scorer, which makes skipping based on
+ // comparison with minCompetitiveScore harder / less effective.
+ upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo);
+ }
+ assert target <= upTo;
+
+ for (DisiWrapper w : allScorers) {
+ if (w.doc <= upTo) {
Review Comment:
I have a slight preference for the second approach (remove plus a comment) so that our test coverage build doesn't show up this branch as a branch that is never tested.
##########
lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java:
##########
@@ -118,6 +118,21 @@ private Scorer getInternal(long leadCost) throws IOException {
leadCost);
}
+ // pure two terms disjunction
+ if (scoreMode == ScoreMode.TOP_SCORES
+ && minShouldMatch == 0
Review Comment:
`minShouldMatch = 1` also qualifies
```suggestion
&& minShouldMatch <= 1
```
##########
lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java:
##########
@@ -39,6 +39,9 @@ public class DisiWrapper {
// For WANDScorer
long maxScore;
+ // For BlockMaxMaxscoreScorer
+ float maxScoreFloat;
Review Comment:
Maybe we should call this one `maxScore` and rename the other one `scaledMaxScore` or something like that
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
Review Comment:
Could it be pkg-private?
##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+ // current doc ID of the leads
+ private int doc;
+
+ // doc id boundary that all scorers maxScore are valid
+ private int upTo = -1;
+
+ // heap of scorers ordered by doc ID
+ private final DisiPriorityQueue essentialsScorers;
+ // list of scorers ordered by maxScore
+ private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+ private final DisiWrapper[] allScorers;
+
+ // sum of max scores of scorers in nonEssentialScorers list
+ private float nonEssentialMaxScoreSum;
+
+ private long cost;
+
+ private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+ // scaled min competitive score
+ private float minCompetitiveScore = 0;
+
+ private int cachedScoredDoc = -1;
+ private float cachedScore = 0;
+
+ /**
+ * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm
+ * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to
+ * WANDScorer, and could be used for simple disjunction queries.
+ *
+ * @param weight The weight to be used.
+ * @param scorers The sub scorers this Scorer should iterate on for optional clauses
+ */
+ public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException {
+ super(weight);
+
+ this.doc = -1;
+ this.allScorers = new DisiWrapper[scorers.size()];
+ this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+ this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+ long cost = 0;
+ for (int i = 0; i < scorers.size(); i++) {
+ DisiWrapper w = new DisiWrapper(scorers.get(i));
+ cost += w.cost;
+ allScorers[i] = w;
+ }
+
+ this.cost = cost;
+ maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+ }
+
+ @Override
+ public DocIdSetIterator iterator() {
+ // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+ return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+ }
+
+ @Override
+ public TwoPhaseIterator twoPhaseIterator() {
+ DocIdSetIterator approximation =
+ new DocIdSetIterator() {
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ while (true) {
+
+ if (target > upTo) {
+ updateMaxScoresAndLists(target);
+ } else {
+ // minCompetitiveScore might have increased,
+ // move potentially no-longer-competitive scorers from essential to non-essential
+ // list
+ movePotentiallyNonCompetitiveScorers();
+ }
+
+ assert target <= upTo;
+
+ DisiWrapper top = essentialsScorers.top();
+
+ if (top == null) {
+ // all scorers in non-essential list, skip to next boundary or return no_more_docs
+ if (upTo == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else {
+ target = upTo + 1;
+ }
+ } else {
+ // position all scorers in essential list to on or after target
+ while (top.doc < target) {
+ top.doc = top.iterator.advance(target);
+ top = essentialsScorers.updateTop();
+ }
+
+ if (top.doc == NO_MORE_DOCS) {
+ return doc = NO_MORE_DOCS;
+ } else if (top.doc > upTo) {
+ target = upTo + 1;
+ } else {
+ double docScoreUpperBound = nonEssentialMaxScoreSum;
+
+ for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) {
+ docScoreUpperBound += w.scorer.score();
+ }
Review Comment:
:+1:
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1171893517
> I like the idea of creating WANDScorer more explicitly in tests. It doesn't look easy though and this change is already great so I wonder if we should keep it for a follow-up.
Sounds good. I've created this follow-up issue https://issues.apache.org/jira/browse/LUCENE-10635 .
> I reviewed the change and left some very minor comments but it looks great to me overall. Let's get it in.
Awesome, thanks for all the review and feedback @jpountz, I really appreciate it! Iterating on the solution and seeing it improved each time is a lot of fun and I enjoy this process a lot!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1172956347
Thanks again @jpountz ! I've created the above PR to backport these changes to `branch_9x`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1162613846
Hi @jpountz , I have adapted the original BMM PR https://github.com/apache/lucene/pull/101 to the latest codebase and run further experiments on using it for 2 clauses disjunction. The results look both encouraging and strange :D
When I run `python3 src/python/localrun.py -source wikimedium10m` with only `OrHighLow`, `OrHighHigh` and `OrHighMed` tasks from ` tasks/wikimedium.10M.nostopwords.tasks tasks/wikimedium.10M.nostopwords.tasks` (by removing the other tasks), I got pretty impressive speedup on average:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
PKLookup 173.31 (24.6%) 181.79 (26.8%) 4.9% ( -37% - 74%) 0.547
OrHighLow 166.70 (62.8%) 385.94 (101.5%) 131.5% ( -20% - 794%) 0.000
OrHighHigh 9.27 (48.9%) 23.44 (85.9%) 152.9% ( 12% - 562%) 0.000
OrHighMed 18.45 (61.3%) 55.92 (137.3%) 203.0% ( 2% - 1037%) 0.000
```
However, when I run all the tasks, `OrHighLow`, `OrHighHigh` and `OrHighMed` have only moderate speedup on average and sometimes even slightly negatively impacted:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighHigh 35.23 (7.2%) 23.86 (7.0%) -32.3% ( -43% - -19%) 0.000
OrHighLow 898.97 (4.4%) 788.65 (4.2%) -12.3% ( -20% - -3%) 0.000
BrowseDateSSDVFacets 2.62 (27.0%) 2.43 (18.8%) -7.4% ( -41% - 52%) 0.312
HighSpanNear 21.86 (6.4%) 21.00 (6.1%) -4.0% ( -15% - 9%) 0.045
Fuzzy2 94.11 (12.4%) 90.59 (9.8%) -3.7% ( -23% - 21%) 0.290
LowSloppyPhrase 65.63 (8.2%) 63.99 (8.6%) -2.5% ( -17% - 15%) 0.347
HighSloppyPhrase 17.25 (5.3%) 16.84 (5.3%) -2.4% ( -12% - 8%) 0.154
TermDTSort 160.18 (8.2%) 156.49 (9.9%) -2.3% ( -18% - 17%) 0.423
HighTermDayOfYearSort 164.86 (6.8%) 161.77 (10.1%) -1.9% ( -17% - 16%) 0.490
OrHighMedDayTaxoFacets 11.05 (7.1%) 10.86 (7.3%) -1.7% ( -15% - 13%) 0.465
AndHighLow 1482.47 (4.0%) 1459.63 (10.6%) -1.5% ( -15% - 13%) 0.544
MedSpanNear 27.77 (7.2%) 27.49 (6.1%) -1.0% ( -13% - 13%) 0.628
HighTermTitleBDVSort 197.53 (7.4%) 195.53 (6.3%) -1.0% ( -13% - 13%) 0.640
AndHighMedDayTaxoFacets 43.61 (8.7%) 43.19 (10.1%) -1.0% ( -18% - 19%) 0.745
HighIntervalsOrdered 17.38 (8.7%) 17.26 (7.5%) -0.7% ( -15% - 16%) 0.782
HighPhrase 454.15 (5.0%) 451.67 (8.7%) -0.5% ( -13% - 13%) 0.807
BrowseRandomLabelSSDVFacets 15.40 (8.1%) 15.32 (7.3%) -0.5% ( -14% - 16%) 0.837
AndHighHighDayTaxoFacets 16.94 (7.0%) 16.87 (6.6%) -0.5% ( -13% - 14%) 0.834
LowSpanNear 9.08 (4.8%) 9.05 (4.3%) -0.3% ( -9% - 9%) 0.838
Wildcard 55.15 (11.3%) 55.01 (12.0%) -0.2% ( -21% - 26%) 0.947
MedPhrase 976.56 (2.8%) 977.29 (3.3%) 0.1% ( -5% - 6%) 0.939
MedTermDayTaxoFacets 77.21 (8.6%) 77.46 (8.7%) 0.3% ( -15% - 19%) 0.908
OrNotHighLow 1187.34 (5.1%) 1191.80 (5.3%) 0.4% ( -9% - 11%) 0.819
OrHighNotHigh 1556.42 (4.4%) 1566.26 (4.5%) 0.6% ( -7% - 9%) 0.654
LowIntervalsOrdered 158.96 (6.4%) 160.03 (8.9%) 0.7% ( -13% - 17%) 0.785
OrNotHighHigh 1427.22 (3.8%) 1436.97 (5.0%) 0.7% ( -7% - 9%) 0.628
Fuzzy1 116.55 (11.4%) 117.41 (9.4%) 0.7% ( -18% - 24%) 0.823
LowTerm 3470.46 (5.9%) 3500.25 (5.9%) 0.9% ( -10% - 13%) 0.644
HighTermMonthSort 169.22 (10.4%) 170.68 (14.9%) 0.9% ( -22% - 29%) 0.832
IntNRQ 115.77 (22.6%) 116.95 (21.3%) 1.0% ( -34% - 57%) 0.883
MedTerm 3042.06 (4.5%) 3080.17 (5.4%) 1.3% ( -8% - 11%) 0.427
HighTerm 2407.19 (5.5%) 2440.56 (4.1%) 1.4% ( -7% - 11%) 0.369
Prefix3 396.92 (10.2%) 403.19 (8.6%) 1.6% ( -15% - 22%) 0.595
OrNotHighMed 1695.31 (3.6%) 1722.43 (5.5%) 1.6% ( -7% - 11%) 0.274
MedSloppyPhrase 13.19 (4.5%) 13.40 (5.0%) 1.6% ( -7% - 11%) 0.283
OrHighNotLow 1473.94 (6.7%) 1500.95 (6.6%) 1.8% ( -10% - 16%) 0.383
AndHighMed 201.69 (4.5%) 205.65 (9.1%) 2.0% ( -11% - 16%) 0.387
PKLookup 247.69 (11.3%) 253.24 (9.6%) 2.2% ( -16% - 26%) 0.499
MedIntervalsOrdered 30.40 (8.1%) 31.13 (7.7%) 2.4% ( -12% - 19%) 0.338
OrHighNotMed 1534.55 (4.5%) 1571.83 (3.9%) 2.4% ( -5% - 11%) 0.068
Respell 90.55 (7.9%) 92.75 (8.8%) 2.4% ( -13% - 20%) 0.359
AndHighHigh 65.14 (7.1%) 67.16 (8.3%) 3.1% ( -11% - 19%) 0.206
BrowseDayOfYearSSDVFacets 20.96 (9.7%) 21.65 (11.1%) 3.3% ( -15% - 26%) 0.320
LowPhrase 63.71 (6.9%) 65.86 (9.2%) 3.4% ( -11% - 20%) 0.191
BrowseMonthSSDVFacets 22.49 (13.6%) 23.62 (14.8%) 5.0% ( -20% - 38%) 0.263
BrowseMonthTaxoFacets 26.25 (43.5%) 34.10 (40.2%) 29.9% ( -37% - 200%) 0.024
BrowseDateTaxoFacets 22.04 (40.4%) 29.87 (63.1%) 35.5% ( -48% - 233%) 0.034
BrowseDayOfYearTaxoFacets 22.07 (39.3%) 31.04 (64.1%) 40.6% ( -45% - 236%) 0.016
OrHighMed 59.30 (9.3%) 84.18 (20.1%) 41.9% ( 11% - 78%) 0.000
BrowseRandomLabelTaxoFacets 20.38 (52.4%) 30.77 (88.6%) 50.9% ( -59% - 403%) 0.027
```
This seems to suggest tasks run may interfere with each other as opposed to independent? Do you have any suggestion where I can look into next to confirm the performance impact of this change ?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1166714692
Hi @jpountz, I've taken some ideas from your bulk scorer implementation and was able to simplify my code as well as to boost the performance when under default `SEARCH_NUM_THREADS` [here](https://github.com/apache/lucene/pull/972/commits/cb8ab7485a405e9517049822eef36ae590f2f65b). The benchmark results look similar now albeit a bit varying :
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
BrowseDateSSDVFacets 4.38 (35.8%) 4.01 (30.3%) -8.3% ( -54% - 90%) 0.431
Prefix3 811.56 (5.6%) 782.08 (7.8%) -3.6% ( -16% - 10%) 0.091
OrHighMedDayTaxoFacets 11.42 (5.6%) 11.11 (8.0%) -2.7% ( -15% - 11%) 0.223
IntNRQ 297.19 (1.5%) 291.62 (5.0%) -1.9% ( -8% - 4%) 0.107
Wildcard 269.43 (5.0%) 264.57 (6.4%) -1.8% ( -12% - 10%) 0.319
BrowseRandomLabelSSDVFacets 20.22 (8.8%) 19.86 (8.4%) -1.8% ( -17% - 16%) 0.518
HighTermTitleBDVSort 236.73 (8.6%) 232.93 (8.6%) -1.6% ( -17% - 17%) 0.555
AndHighHighDayTaxoFacets 12.67 (2.9%) 12.48 (4.4%) -1.5% ( -8% - 5%) 0.186
BrowseMonthTaxoFacets 32.18 (36.3%) 31.72 (38.8%) -1.4% ( -56% - 115%) 0.904
LowPhrase 1725.41 (3.3%) 1702.14 (5.3%) -1.3% ( -9% - 7%) 0.334
MedSloppyPhrase 111.58 (3.2%) 110.16 (3.8%) -1.3% ( -8% - 5%) 0.250
HighPhrase 930.18 (2.5%) 919.75 (3.4%) -1.1% ( -6% - 4%) 0.234
MedTermDayTaxoFacets 46.10 (3.9%) 45.68 (4.8%) -0.9% ( -9% - 8%) 0.514
TermDTSort 341.03 (7.2%) 338.23 (8.5%) -0.8% ( -15% - 15%) 0.740
AndHighMedDayTaxoFacets 39.88 (1.9%) 39.57 (3.1%) -0.8% ( -5% - 4%) 0.349
HighTermDayOfYearSort 148.85 (7.6%) 147.86 (8.3%) -0.7% ( -15% - 16%) 0.792
HighTermMonthSort 218.46 (8.6%) 217.06 (9.2%) -0.6% ( -16% - 18%) 0.819
OrNotHighLow 2696.50 (5.4%) 2681.95 (5.0%) -0.5% ( -10% - 10%) 0.743
LowSloppyPhrase 22.79 (2.0%) 22.69 (2.9%) -0.4% ( -5% - 4%) 0.585
Fuzzy2 125.08 (2.7%) 124.54 (4.3%) -0.4% ( -7% - 6%) 0.708
HighSloppyPhrase 21.02 (2.3%) 20.94 (3.0%) -0.4% ( -5% - 5%) 0.629
OrHighNotMed 1805.04 (4.7%) 1797.98 (5.8%) -0.4% ( -10% - 10%) 0.816
BrowseMonthSSDVFacets 29.37 (14.0%) 29.26 (13.4%) -0.4% ( -24% - 31%) 0.933
MedPhrase 205.52 (1.7%) 204.78 (3.0%) -0.4% ( -4% - 4%) 0.643
Fuzzy1 128.47 (2.8%) 128.05 (4.2%) -0.3% ( -7% - 6%) 0.772
AndHighLow 2126.24 (5.3%) 2124.42 (5.6%) -0.1% ( -10% - 11%) 0.960
Respell 83.33 (3.2%) 83.33 (4.2%) 0.0% ( -7% - 7%) 0.998
OrHighNotHigh 1415.44 (4.4%) 1419.78 (4.5%) 0.3% ( -8% - 9%) 0.827
OrHighNotLow 1655.08 (4.4%) 1663.51 (4.7%) 0.5% ( -8% - 10%) 0.725
OrNotHighHigh 1035.89 (3.1%) 1042.85 (4.6%) 0.7% ( -6% - 8%) 0.587
PKLookup 283.77 (5.1%) 285.92 (4.5%) 0.8% ( -8% - 10%) 0.616
LowTerm 3616.62 (4.1%) 3655.48 (5.3%) 1.1% ( -8% - 10%) 0.476
HighSpanNear 15.54 (2.2%) 15.71 (3.5%) 1.1% ( -4% - 7%) 0.241
MedTerm 2615.07 (4.0%) 2645.27 (4.0%) 1.2% ( -6% - 9%) 0.364
OrNotHighMed 1759.45 (4.2%) 1779.94 (4.6%) 1.2% ( -7% - 10%) 0.406
LowSpanNear 66.06 (2.9%) 66.83 (4.3%) 1.2% ( -5% - 8%) 0.316
BrowseDayOfYearSSDVFacets 26.94 (10.7%) 27.30 (9.7%) 1.3% ( -17% - 24%) 0.684
MedIntervalsOrdered 86.40 (5.1%) 87.58 (4.8%) 1.4% ( -8% - 11%) 0.387
AndHighMed 113.83 (3.4%) 115.48 (5.8%) 1.5% ( -7% - 11%) 0.334
AndHighHigh 42.71 (2.9%) 43.35 (5.8%) 1.5% ( -6% - 10%) 0.300
HighTerm 2639.27 (4.2%) 2678.82 (4.4%) 1.5% ( -6% - 10%) 0.273
HighIntervalsOrdered 11.98 (7.1%) 12.17 (6.7%) 1.5% ( -11% - 16%) 0.480
MedSpanNear 97.91 (2.7%) 99.79 (4.0%) 1.9% ( -4% - 8%) 0.077
LowIntervalsOrdered 222.06 (12.2%) 229.67 (10.9%) 3.4% ( -17% - 30%) 0.348
BrowseDateTaxoFacets 28.41 (35.2%) 31.76 (47.7%) 11.8% ( -52% - 146%) 0.374
BrowseDayOfYearTaxoFacets 28.45 (34.8%) 31.91 (47.4%) 12.2% ( -51% - 144%) 0.355
BrowseRandomLabelTaxoFacets 27.25 (44.3%) 31.80 (60.7%) 16.7% ( -61% - 218%) 0.321
OrHighLow 1204.21 (3.6%) 1475.66 (6.9%) 22.5% ( 11% - 34%) 0.000
OrHighMed 207.04 (6.4%) 285.96 (5.7%) 38.1% ( 24% - 53%) 0.000
OrHighHigh 50.80 (6.2%) 88.52 (8.1%) 74.3% ( 56% - 94%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
BrowseRandomLabelSSDVFacets 20.99 (8.4%) 20.06 (7.6%) -4.4% ( -18% - 12%) 0.081
Prefix3 264.64 (5.6%) 258.91 (7.0%) -2.2% ( -13% - 11%) 0.278
HighSpanNear 55.97 (2.8%) 55.33 (3.4%) -1.1% ( -7% - 5%) 0.240
BrowseDayOfYearSSDVFacets 27.04 (8.0%) 26.74 (4.1%) -1.1% ( -12% - 12%) 0.591
IntNRQ 123.74 (25.9%) 122.52 (22.6%) -1.0% ( -39% - 64%) 0.898
Wildcard 144.91 (5.7%) 143.53 (5.5%) -0.9% ( -11% - 10%) 0.593
MedSpanNear 58.12 (2.2%) 57.62 (2.6%) -0.9% ( -5% - 4%) 0.266
LowSloppyPhrase 56.23 (2.7%) 55.75 (2.5%) -0.8% ( -5% - 4%) 0.309
LowSpanNear 83.48 (1.9%) 82.83 (2.2%) -0.8% ( -4% - 3%) 0.237
AndHighMedDayTaxoFacets 66.27 (2.1%) 65.85 (1.8%) -0.6% ( -4% - 3%) 0.307
MedTermDayTaxoFacets 61.16 (4.3%) 60.78 (4.9%) -0.6% ( -9% - 8%) 0.676
HighTermMonthSort 197.74 (11.7%) 196.88 (9.9%) -0.4% ( -19% - 23%) 0.898
MedTerm 3912.22 (3.9%) 3899.21 (4.0%) -0.3% ( -7% - 7%) 0.789
AndHighHighDayTaxoFacets 52.69 (1.5%) 52.53 (1.5%) -0.3% ( -3% - 2%) 0.515
PKLookup 285.12 (4.8%) 284.29 (4.2%) -0.3% ( -8% - 9%) 0.837
MedPhrase 155.83 (2.1%) 155.88 (1.7%) 0.0% ( -3% - 3%) 0.962
OrNotHighLow 2052.07 (4.9%) 2053.48 (5.0%) 0.1% ( -9% - 10%) 0.965
Respell 86.89 (3.3%) 86.99 (3.1%) 0.1% ( -6% - 6%) 0.912
Fuzzy1 136.62 (1.9%) 136.98 (2.6%) 0.3% ( -4% - 4%) 0.709
HighPhrase 652.65 (2.1%) 654.44 (1.8%) 0.3% ( -3% - 4%) 0.657
Fuzzy2 104.64 (2.7%) 105.09 (2.3%) 0.4% ( -4% - 5%) 0.591
OrNotHighHigh 1558.41 (2.5%) 1565.23 (2.4%) 0.4% ( -4% - 5%) 0.572
AndHighLow 1378.89 (3.6%) 1385.39 (3.4%) 0.5% ( -6% - 7%) 0.667
HighSloppyPhrase 43.66 (3.8%) 43.88 (3.3%) 0.5% ( -6% - 7%) 0.644
BrowseDateSSDVFacets 4.31 (36.0%) 4.34 (32.6%) 0.6% ( -49% - 107%) 0.955
LowIntervalsOrdered 24.44 (3.3%) 24.60 (3.5%) 0.7% ( -5% - 7%) 0.536
MedSloppyPhrase 121.45 (3.4%) 122.31 (3.2%) 0.7% ( -5% - 7%) 0.499
OrNotHighMed 1518.27 (2.9%) 1530.49 (3.5%) 0.8% ( -5% - 7%) 0.432
LowPhrase 1590.38 (3.3%) 1604.25 (3.9%) 0.9% ( -6% - 8%) 0.447
HighTermDayOfYearSort 227.17 (7.8%) 229.44 (7.7%) 1.0% ( -13% - 17%) 0.684
OrHighNotMed 1426.20 (4.6%) 1440.64 (3.9%) 1.0% ( -7% - 9%) 0.453
HighIntervalsOrdered 6.34 (4.4%) 6.42 (4.8%) 1.2% ( -7% - 10%) 0.413
LowTerm 3170.27 (5.2%) 3208.75 (6.0%) 1.2% ( -9% - 13%) 0.495
OrHighNotLow 1904.11 (4.6%) 1929.56 (3.8%) 1.3% ( -6% - 10%) 0.316
AndHighMed 301.92 (4.1%) 306.40 (3.9%) 1.5% ( -6% - 9%) 0.243
MedIntervalsOrdered 44.71 (4.1%) 45.38 (4.5%) 1.5% ( -6% - 10%) 0.274
OrHighNotHigh 2139.92 (2.9%) 2172.45 (4.3%) 1.5% ( -5% - 8%) 0.191
HighTermTitleBDVSort 300.97 (8.9%) 306.56 (8.5%) 1.9% ( -14% - 21%) 0.499
HighTerm 2745.94 (3.5%) 2799.23 (3.6%) 1.9% ( -4% - 9%) 0.083
TermDTSort 338.13 (6.3%) 345.25 (6.8%) 2.1% ( -10% - 16%) 0.309
AndHighHigh 105.27 (3.0%) 107.76 (3.6%) 2.4% ( -4% - 9%) 0.024
BrowseMonthTaxoFacets 25.29 (31.4%) 25.99 (35.5%) 2.7% ( -48% - 101%) 0.795
OrHighMedDayTaxoFacets 7.05 (7.7%) 7.26 (5.0%) 3.0% ( -9% - 17%) 0.146
BrowseDayOfYearTaxoFacets 27.07 (29.3%) 28.39 (42.0%) 4.8% ( -51% - 107%) 0.672
BrowseDateTaxoFacets 27.00 (29.5%) 28.33 (42.5%) 4.9% ( -51% - 109%) 0.671
BrowseMonthSSDVFacets 29.01 (10.5%) 30.45 (10.7%) 5.0% ( -14% - 29%) 0.139
BrowseRandomLabelTaxoFacets 25.85 (37.4%) 28.27 (58.1%) 9.4% ( -62% - 167%) 0.544
OrHighLow 1322.31 (2.8%) 1506.33 (3.5%) 13.9% ( 7% - 20%) 0.000
OrHighHigh 37.60 (5.5%) 59.95 (9.9%) 59.4% ( 41% - 79%) 0.000
OrHighMed 146.48 (4.4%) 320.00 (10.1%) 118.5% ( 99% - 139%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
HighTermTitleBDVSort 232.99 (8.7%) 227.90 (6.6%) -2.2% ( -16% - 14%) 0.372
BrowseDayOfYearSSDVFacets 27.67 (9.1%) 27.14 (10.6%) -1.9% ( -19% - 19%) 0.533
AndHighMed 195.79 (4.8%) 192.91 (4.3%) -1.5% ( -10% - 8%) 0.309
Prefix3 752.40 (9.1%) 742.56 (7.8%) -1.3% ( -16% - 17%) 0.627
HighTerm 2655.43 (5.3%) 2622.06 (3.1%) -1.3% ( -9% - 7%) 0.360
AndHighHigh 171.69 (4.6%) 169.66 (4.0%) -1.2% ( -9% - 7%) 0.390
OrNotHighHigh 1275.59 (3.6%) 1261.66 (2.5%) -1.1% ( -6% - 5%) 0.259
OrHighNotHigh 1670.68 (4.4%) 1655.08 (3.0%) -0.9% ( -7% - 6%) 0.433
MedSloppyPhrase 12.51 (2.8%) 12.40 (2.5%) -0.9% ( -5% - 4%) 0.277
OrHighNotMed 1677.16 (4.8%) 1663.03 (3.0%) -0.8% ( -8% - 7%) 0.508
MedTerm 4174.46 (3.9%) 4144.81 (4.6%) -0.7% ( -8% - 8%) 0.598
LowTerm 3205.62 (4.9%) 3183.90 (4.5%) -0.7% ( -9% - 9%) 0.647
IntNRQ 86.34 (8.2%) 85.91 (8.2%) -0.5% ( -15% - 17%) 0.846
PKLookup 290.40 (4.1%) 289.10 (4.4%) -0.4% ( -8% - 8%) 0.740
LowSpanNear 18.90 (2.9%) 18.81 (3.1%) -0.4% ( -6% - 5%) 0.640
OrNotHighMed 1413.70 (2.7%) 1408.55 (3.0%) -0.4% ( -5% - 5%) 0.683
AndHighLow 1965.22 (4.7%) 1958.33 (3.7%) -0.4% ( -8% - 8%) 0.794
OrHighNotLow 1628.65 (4.1%) 1624.17 (3.9%) -0.3% ( -7% - 8%) 0.828
LowSloppyPhrase 244.50 (3.0%) 243.99 (2.2%) -0.2% ( -5% - 5%) 0.802
MedSpanNear 69.69 (3.4%) 69.56 (2.8%) -0.2% ( -6% - 6%) 0.844
LowPhrase 176.86 (3.9%) 176.70 (4.0%) -0.1% ( -7% - 8%) 0.941
HighSpanNear 25.67 (4.0%) 25.65 (3.6%) -0.1% ( -7% - 7%) 0.964
HighIntervalsOrdered 8.08 (10.1%) 8.08 (9.8%) 0.0% ( -18% - 22%) 0.994
LowIntervalsOrdered 90.46 (6.5%) 90.49 (6.4%) 0.0% ( -12% - 13%) 0.988
Wildcard 297.27 (5.4%) 297.43 (4.4%) 0.1% ( -9% - 10%) 0.972
BrowseMonthSSDVFacets 28.60 (8.6%) 28.66 (11.6%) 0.2% ( -18% - 22%) 0.947
MedIntervalsOrdered 69.85 (6.4%) 70.00 (6.4%) 0.2% ( -11% - 13%) 0.913
AndHighHighDayTaxoFacets 32.05 (1.8%) 32.13 (2.4%) 0.3% ( -3% - 4%) 0.705
HighSloppyPhrase 15.93 (3.3%) 16.00 (3.3%) 0.4% ( -6% - 7%) 0.682
OrNotHighLow 1784.41 (4.1%) 1792.59 (4.5%) 0.5% ( -7% - 9%) 0.734
Fuzzy1 146.35 (2.9%) 147.05 (3.8%) 0.5% ( -6% - 7%) 0.656
Respell 114.58 (2.5%) 115.18 (3.1%) 0.5% ( -5% - 6%) 0.559
AndHighMedDayTaxoFacets 134.21 (1.7%) 135.03 (2.6%) 0.6% ( -3% - 4%) 0.373
TermDTSort 176.85 (5.1%) 177.94 (6.9%) 0.6% ( -10% - 13%) 0.747
HighPhrase 134.11 (2.2%) 135.02 (2.8%) 0.7% ( -4% - 5%) 0.394
Fuzzy2 128.74 (2.5%) 129.63 (3.2%) 0.7% ( -4% - 6%) 0.450
MedPhrase 85.90 (2.2%) 86.68 (1.9%) 0.9% ( -3% - 5%) 0.158
MedTermDayTaxoFacets 69.57 (4.2%) 70.39 (4.0%) 1.2% ( -6% - 9%) 0.368
HighTermDayOfYearSort 196.16 (2.7%) 198.50 (6.6%) 1.2% ( -7% - 10%) 0.455
BrowseRandomLabelSSDVFacets 20.42 (9.4%) 20.68 (9.7%) 1.3% ( -16% - 22%) 0.671
BrowseDateSSDVFacets 4.58 (28.7%) 4.66 (32.0%) 1.8% ( -45% - 87%) 0.855
OrHighMedDayTaxoFacets 19.97 (7.5%) 20.62 (6.2%) 3.2% ( -9% - 18%) 0.138
HighTermMonthSort 212.99 (5.8%) 219.92 (8.9%) 3.3% ( -10% - 19%) 0.172
BrowseMonthTaxoFacets 27.65 (35.6%) 31.27 (47.3%) 13.1% ( -51% - 149%) 0.323
BrowseDateTaxoFacets 24.81 (27.6%) 28.54 (43.5%) 15.1% ( -43% - 118%) 0.191
BrowseDayOfYearTaxoFacets 24.85 (26.6%) 28.68 (43.3%) 15.4% ( -43% - 116%) 0.175
BrowseRandomLabelTaxoFacets 23.80 (38.7%) 28.05 (56.9%) 17.8% ( -56% - 185%) 0.247
OrHighLow 886.55 (3.3%) 1105.54 (5.0%) 24.7% ( 15% - 34%) 0.000
OrHighMed 209.42 (6.1%) 304.52 (4.7%) 45.4% ( 32% - 59%) 0.000
OrHighHigh 46.51 (6.4%) 68.85 (6.9%) 48.0% ( 32% - 65%) 0.000
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
HighTermDayOfYearSort 268.07 (7.6%) 260.40 (3.8%) -2.9% ( -13% - 9%) 0.132
PKLookup 284.74 (5.7%) 278.54 (5.8%) -2.2% ( -12% - 9%) 0.234
BrowseRandomLabelSSDVFacets 20.82 (10.2%) 20.45 (6.7%) -1.8% ( -16% - 16%) 0.512
Fuzzy2 234.77 (3.0%) 230.58 (3.3%) -1.8% ( -7% - 4%) 0.073
MedTermDayTaxoFacets 42.28 (4.7%) 41.56 (5.6%) -1.7% ( -11% - 9%) 0.298
HighTermMonthSort 218.63 (9.6%) 215.12 (7.8%) -1.6% ( -17% - 17%) 0.563
TermDTSort 262.87 (7.8%) 258.91 (7.2%) -1.5% ( -15% - 14%) 0.527
Prefix3 349.11 (7.1%) 344.04 (8.0%) -1.5% ( -15% - 14%) 0.544
AndHighHighDayTaxoFacets 20.76 (2.6%) 20.47 (3.6%) -1.4% ( -7% - 4%) 0.154
OrHighMedDayTaxoFacets 9.09 (5.7%) 8.96 (6.1%) -1.4% ( -12% - 11%) 0.460
Respell 109.27 (3.4%) 107.82 (2.5%) -1.3% ( -6% - 4%) 0.158
Fuzzy1 177.60 (3.9%) 175.30 (2.9%) -1.3% ( -7% - 5%) 0.229
Wildcard 391.34 (8.0%) 386.37 (9.0%) -1.3% ( -16% - 17%) 0.636
MedPhrase 1174.01 (3.6%) 1160.58 (4.8%) -1.1% ( -9% - 7%) 0.396
AndHighMedDayTaxoFacets 34.66 (2.7%) 34.27 (3.4%) -1.1% ( -7% - 5%) 0.243
AndHighMed 334.12 (6.3%) 330.86 (4.8%) -1.0% ( -11% - 10%) 0.584
AndHighLow 1720.82 (5.3%) 1704.25 (4.3%) -1.0% ( -10% - 9%) 0.527
MedSpanNear 18.51 (3.6%) 18.36 (3.8%) -0.8% ( -7% - 6%) 0.491
LowPhrase 159.66 (6.1%) 158.43 (6.3%) -0.8% ( -12% - 12%) 0.693
LowSpanNear 14.66 (3.4%) 14.54 (3.4%) -0.8% ( -7% - 6%) 0.475
OrHighNotLow 1848.94 (4.3%) 1837.32 (4.4%) -0.6% ( -8% - 8%) 0.646
HighSpanNear 36.23 (3.5%) 36.04 (2.7%) -0.5% ( -6% - 5%) 0.590
OrNotHighHigh 1355.23 (4.1%) 1349.28 (4.7%) -0.4% ( -8% - 8%) 0.753
HighPhrase 53.43 (2.7%) 53.22 (3.9%) -0.4% ( -6% - 6%) 0.706
HighSloppyPhrase 54.76 (3.5%) 54.68 (4.3%) -0.2% ( -7% - 7%) 0.899
HighTermTitleBDVSort 199.46 (9.0%) 199.23 (12.6%) -0.1% ( -19% - 23%) 0.973
OrHighNotHigh 1289.90 (3.5%) 1292.38 (3.8%) 0.2% ( -6% - 7%) 0.869
MedSloppyPhrase 23.10 (2.2%) 23.15 (2.7%) 0.2% ( -4% - 5%) 0.780
OrHighNotMed 1880.20 (3.9%) 1886.66 (5.3%) 0.3% ( -8% - 9%) 0.814
OrNotHighLow 1524.76 (3.7%) 1530.74 (4.2%) 0.4% ( -7% - 8%) 0.755
IntNRQ 1154.82 (2.3%) 1159.36 (4.6%) 0.4% ( -6% - 7%) 0.731
BrowseDayOfYearSSDVFacets 26.46 (9.5%) 26.56 (8.8%) 0.4% ( -16% - 20%) 0.890
OrNotHighMed 1280.97 (3.4%) 1286.37 (3.4%) 0.4% ( -6% - 7%) 0.695
LowSloppyPhrase 18.44 (3.1%) 18.53 (3.4%) 0.4% ( -5% - 7%) 0.671
AndHighHigh 64.90 (3.7%) 65.31 (3.3%) 0.6% ( -6% - 7%) 0.565
LowTerm 4926.25 (5.3%) 4962.05 (6.0%) 0.7% ( -10% - 12%) 0.684
HighIntervalsOrdered 12.83 (4.8%) 12.93 (4.5%) 0.8% ( -8% - 10%) 0.608
LowIntervalsOrdered 199.64 (5.5%) 201.26 (5.2%) 0.8% ( -9% - 12%) 0.630
MedIntervalsOrdered 18.57 (5.1%) 18.72 (4.9%) 0.8% ( -8% - 11%) 0.597
HighTerm 3069.83 (4.1%) 3109.22 (5.1%) 1.3% ( -7% - 10%) 0.378
MedTerm 3000.10 (4.0%) 3039.40 (5.1%) 1.3% ( -7% - 10%) 0.365
BrowseMonthSSDVFacets 29.00 (12.5%) 30.24 (13.9%) 4.3% ( -19% - 35%) 0.302
BrowseDateSSDVFacets 4.53 (31.8%) 4.88 (31.7%) 7.7% ( -42% - 104%) 0.445
BrowseDateTaxoFacets 27.12 (37.5%) 29.80 (45.3%) 9.9% ( -53% - 148%) 0.453
BrowseDayOfYearTaxoFacets 27.25 (37.2%) 30.01 (44.3%) 10.1% ( -52% - 145%) 0.433
BrowseRandomLabelTaxoFacets 25.82 (44.7%) 29.41 (60.5%) 13.9% ( -63% - 215%) 0.409
BrowseMonthTaxoFacets 25.92 (35.0%) 29.53 (45.3%) 13.9% ( -49% - 145%) 0.277
OrHighLow 885.95 (4.3%) 1142.96 (3.7%) 29.0% ( 20% - 38%) 0.000
OrHighHigh 46.72 (6.1%) 65.39 (5.8%) 40.0% ( 26% - 55%) 0.000
OrHighMed 144.59 (6.7%) 525.10 (11.3%) 263.2% ( 229% - 301%) 0.000
```
For
> One optimization it has that seemed to help that your scorer doesn't have is to check for every non-essential scorer whether the score obtained so far plus the sum of max scores of non essential scorers that haven't been checked yet is still competitive.
I implemented [similar logic](
https://github.com/apache/lucene/pull/972/commits/2f2abd7a133659e8c195afcc04e9435e6328c3e5#diff-6d2292dfdef5525e6205d3298d49b2c81c41ec41bea6e30148f3cf22f6a0f5d0R180-R198) to move the next least contributing essential scorer into non-essential scorer list when minCompetitiveScore increased, I feel the effect would be similar?
In terms of next steps, I'm wondering if there's a preference between bulk scorer and scorer implementations when performance improvement is similar (maybe one type of scorer can be used in more places) ? I've learnt a lot from both PRs already and either one looks like a good improvement for disjunction queries!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1168197720
> > I feel the effect would be similar?
>
> Indeed, sorry I had misread your code!
>
No worry, thanks still for the suggestion!
>
> No, it shouldn't matter. Bulk scorers sometimes help yield better performance because it's easier for them to amortize computation across docs, but if they don't yield better performance, there's no point in using a bulk scorer instead of a regular scorer.
Ok I see, makes sense.
> I agree that it looks like a great speedup, we should get this in! The benchmark only tests performance of top-level disjunctions of term queries that have two clauses. I'd be curious to get performance numbers for queries like the below ones to see if we need to fine-tune a bit more when this new scorer gets used. Note that I don't think we need to get the performance better for all these queries to merge the change, we could start by only using this new scorer for the (common) case of a top-level disjunction of 2 term queries, and later see if this scorer can handle more disjunctions.
>
> ```
> OrAndHigMedAndHighMed: (+including +looking) (+date +finished) # disjunction of conjunctions, which don't have as good score upper bounds as term queries
> OrHighPhraseHighPhrase: "united states" "new york" # disjunction of phrase queries, which don't have as good score upper bounds as term queries and are slow to advance
> AndHighOrMedMed: +be +(mostly interview) # disjunction within conjunction that leads iteration
> AndMedOrHighHigh: +interview +(at united) # disjunction within conjunction that doesn't lead iteration
> ```
Sounds good! I have run these queries through benchmark and the results look somewhat consistent:
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighPhraseHighPhrase 28.89 (8.7%) 24.19 (4.7%) -16.3% ( -27% - -3%) 0.000
AndHighOrMedMed 101.24 (6.6%) 101.09 (3.0%) -0.1% ( -9% - 10%) 0.927
AndMedOrHighHigh 81.44 (6.3%) 81.62 (3.7%) 0.2% ( -9% - 10%) 0.895
OrAndHigMedAndHighMed 128.26 (7.0%) 136.94 (3.7%) 6.8% ( -3% - 18%) 0.000
PKLookup 221.47 (11.7%) 236.93 (9.1%) 7.0% ( -12% - 31%) 0.035
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighPhraseHighPhrase 27.73 (9.1%) 23.73 (4.6%) -14.4% ( -25% - 0%) 0.000
AndHighOrMedMed 97.09 (13.1%) 99.30 (4.3%) 2.3% ( -13% - 22%) 0.462
AndMedOrHighHigh 75.87 (15.2%) 80.04 (5.7%) 5.5% ( -13% - 31%) 0.128
PKLookup 219.70 (15.7%) 238.75 (12.4%) 8.7% ( -16% - 43%) 0.053
OrAndHigMedAndHighMed 121.83 (13.7%) 134.79 (4.4%) 10.6% ( -6% - 33%) 0.001
```
```
TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value
OrHighPhraseHighPhrase 27.42 (16.2%) 23.99 (4.0%) -12.5% ( -28% - 9%) 0.001
AndHighOrMedMed 96.61 (15.8%) 100.09 (3.6%) 3.6% ( -13% - 27%) 0.321
AndMedOrHighHigh 75.72 (16.8%) 79.53 (4.9%) 5.0% ( -14% - 32%) 0.200
OrAndHigMedAndHighMed 122.33 (16.9%) 136.60 (4.5%) 11.7% ( -8% - 39%) 0.003
PKLookup 207.94 (21.6%) 233.10 (16.5%) 12.1% ( -21% - 63%) 0.046
```
Looks like we may need to restrict the scorer to only term queries, or improve it for phrase queries?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org
[GitHub] [lucene] zacharymorn commented on pull request #972: LUCENE-10480: Use BMM scorer for 2 clauses disjunction
Posted by GitBox <gi...@apache.org>.
zacharymorn commented on PR #972:
URL: https://github.com/apache/lucene/pull/972#issuecomment-1168202563
For `OrHighPhraseHighPhrase`, the JFR CPU sampling result looks similar, but with the modified version calling `advanceShallow` more often, suggesting the BMM implementation might be doing boundary adjustment more often?
Modified:
```
PERCENT CPU SAMPLES STACK
8.63% 1389 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#advance()
5.24% 843 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#advanceShallow()
3.18% 511 java.nio.DirectByteBuffer#get()
2.79% 449 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater()
2.72% 438 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#refillPositions()
2.48% 399 jdk.internal.misc.ScopedMemoryAccess#getByteInternal()
2.19% 353 org.apache.lucene.search.ExactPhraseMatcher$1#getImpacts()
2.11% 339 org.apache.lucene.search.PhraseScorer$1#matches()
2.06% 331 org.apache.lucene.codecs.lucene90.Lucene90ScoreSkipReader#skipTo()
1.83% 294 org.apache.lucene.search.ExactPhraseMatcher$1$1#getImpacts()
1.63% 263 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#nextPosition()
1.49% 240 org.apache.lucene.store.ByteBufferGuard#getByte()
1.24% 200 org.apache.lucene.search.ExactPhraseMatcher#advancePosition()
1.24% 200 org.apache.lucene.search.ConjunctionDISI#doNext()
1.21% 194 java.util.zip.Inflater#inflateBytesBytes()
1.18% 190 org.apache.lucene.search.ExactPhraseMatcher#nextMatch()
1.13% 182 org.apache.lucene.store.DataInput#readVLong()
1.12% 181 org.apache.lucene.search.ExactPhraseMatcher$1#advanceShallow()
1.11% 178 org.apache.lucene.search.ImpactsDISI#advanceShallow()
1.07% 172 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#skipPositions()
0.89% 143 java.lang.Class#isArray()
0.81% 131 org.apache.lucene.codecs.lucene90.ForUtil#expand8()
0.75% 121 org.apache.lucene.search.TwoPhaseIterator$TwoPhaseIteratorAsDocIdSetIterator#doNext()
0.74% 119 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32()
0.71% 115 org.apache.lucene.search.ConjunctionDISI#docID()
0.71% 115 org.apache.lucene.codecs.lucene90.ForUtil#shiftLongs()
0.70% 113 org.apache.lucene.search.PhraseScorer#docID()
0.70% 112 org.apache.lucene.codecs.lucene90.PForUtil#decode()
0.68% 110 org.apache.lucene.search.ExactPhraseMatcher#maxFreq()
0.68% 109 org.apache.lucene.search.ImpactsDISI#docID()
```
Baseline:
```
PERCENT CPU SAMPLES STACK
8.66% 1196 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#advance()
3.88% 536 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader#findFirstGreater()
2.96% 409 java.nio.DirectByteBuffer#get()
2.78% 384 org.apache.lucene.search.ExactPhraseMatcher$1$1#getImpacts()
2.50% 345 org.apache.lucene.codecs.lucene90.Lucene90ScoreSkipReader#skipTo()
2.46% 340 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#advanceShallow()
1.73% 239 org.apache.lucene.search.PhraseScorer$1#matches()
1.72% 237 org.apache.lucene.search.ExactPhraseMatcher$1#getImpacts()
1.48% 204 java.util.zip.Inflater#inflateBytesBytes()
1.48% 204 org.apache.lucene.codecs.lucene90.ForUtil#expand8()
1.23% 170 jdk.internal.misc.ScopedMemoryAccess#getByteInternal()
1.21% 167 org.apache.lucene.search.ConjunctionDISI#doNext()
1.20% 166 org.apache.lucene.codecs.lucene90.PForUtil#innerPrefixSum32()
1.19% 165 org.apache.lucene.store.ByteBufferGuard#getByte()
1.12% 155 org.apache.lucene.codecs.lucene90.PForUtil#prefixSum32()
1.07% 148 java.lang.Class#isArray()
1.06% 147 org.apache.lucene.codecs.lucene90.PForUtil#expand32()
0.98% 135 org.apache.lucene.codecs.lucene90.PForUtil#decode()
0.96% 133 org.apache.lucene.search.ConjunctionDISI#docID()
0.91% 125 org.apache.lucene.search.ExactPhraseMatcher#reset()
0.83% 114 perf.PKLookupTask#go()
0.83% 114 org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockImpactsPostingsEnum#refillPositions()
0.80% 111 org.apache.lucene.search.TwoPhaseIterator$TwoPhaseIteratorAsDocIdSetIterator#doNext()
0.80% 110 org.apache.lucene.search.ImpactsDISI#advanceShallow()
0.78% 108 org.apache.lucene.codecs.lucene90.ForUtil#shiftLongs()
0.78% 108 java.nio.Buffer#position()
0.78% 108 java.nio.Buffer#nextGetIndex()
0.76% 105 org.apache.lucene.store.ByteBufferGuard#ensureValid()
0.71% 98 org.apache.lucene.search.ConjunctionDISI#advance()
0.70% 97 org.apache.lucene.util.PriorityQueue#downHeap()
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org