You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/11/08 18:08:18 UTC
[lucene] branch main updated: Further optimize DrillSideways scoring (#11881)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new c66a5590500 Further optimize DrillSideways scoring (#11881)
c66a5590500 is described below
commit c66a559050048a28d1528c7f562367c8738fe90b
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Tue Nov 8 10:08:12 2022 -0800
Further optimize DrillSideways scoring (#11881)
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/facet/DrillSidewaysScorer.java | 186 ++++++++++++++++-----
.../org/apache/lucene/facet/TestDrillSideways.java | 37 ++--
3 files changed, 157 insertions(+), 69 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bb0bb3539bc..00268e2b9af 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -160,6 +160,9 @@ Optimizations
* GITHUB#11876: Use ByteArrayComparator to speed up PointInSetQuery in single dimension case.
(Guo Feng)
+* GITHUB#11881: Further optimize drill-sideways scoring by specializing the single dimension case
+ and borrowing some concepts from "min should match" scoring. (Greg Miller)
+
* GITHUB#11884: Simplify the logic of matchAll() in IndexSortSortedNumericDocValuesRangeQuery. (Lu Xugang)
Other
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
index 360cb0bf74a..fc415e14897 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@@ -18,13 +18,11 @@ package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
@@ -36,19 +34,15 @@ import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.PriorityQueue;
class DrillSidewaysScorer extends BulkScorer {
private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
- Comparator.comparingLong(e -> e.approximation.cost());
+ Comparator.comparingLong(e -> e.cost);
private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
- new Comparator<DocsAndCost>() {
- @Override
- public int compare(DocsAndCost o1, DocsAndCost o2) {
- return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
- }
- };
+ (a, b) -> Float.compare(a.matchCost, b.matchCost);
// private static boolean DEBUG = false;
@@ -166,35 +160,92 @@ class DrillSidewaysScorer extends BulkScorer {
return Integer.MAX_VALUE;
}
+ /**
+ * Query-first scoring specialization when there is only one drill-sideways dimension, which is
+ * likely a common scenario.
+ */
+ private void doQueryFirstScoringSingleDim(
+ Bits acceptDocs, LeafCollector collector, DocsAndCost dim) throws IOException {
+ int docID = baseApproximation.docID();
+ while (docID != DocIdSetIterator.NO_MORE_DOCS) {
+ assert docID == baseApproximation.docID();
+
+ if (acceptDocs != null && acceptDocs.get(docID) == false) {
+ docID = baseApproximation.nextDoc();
+ continue;
+ }
+
+ if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+ docID = baseApproximation.nextDoc();
+ continue;
+ }
+
+ // We have either a near-miss or full match. Check the sideways dim to see which it is:
+ collectDocID = docID;
+ if (advanceIfBehind(docID, dim.approximation) != docID
+ || (dim.twoPhase != null && dim.twoPhase.matches() == false)) {
+ // The sideways dim missed, so we have a "near miss":
+ collectNearMiss(dim.sidewaysLeafCollector);
+ } else {
+ // Hit passed all filters, so it's "real":
+ collectHit(collector, dim);
+ }
+
+ docID = baseApproximation.nextDoc();
+ }
+ }
+
/**
* Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
- * at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
- * and .advance() on the dim filters.
+ * at once (i.e., like BooleanScorer2, not BooleanScorer).
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
- List<DocsAndCost> allDims = Arrays.asList(dims);
- CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
+ // Specialize the single-dim use-case as we have a more efficient implementation for that:
+ if (dims.length == 1) {
+ doQueryFirstScoringSingleDim(acceptDocs, collector, dims[0]);
+ return;
+ }
+
+ // Sort our sideways dims by approximation cost so we can advance the lower cost ones first:
+ List<DocsAndCost> sidewaysDims = new ArrayList<>(dims.length);
+ sidewaysDims.addAll(List.of(dims));
+ CollectionUtil.timSort(sidewaysDims, APPROXIMATION_COMPARATOR);
- List<DocsAndCost> twoPhaseDims = null;
+ // Maintain (optional) subset of sideways dims that support two-phase iteration, sorted by
+ // matchCost:
+ List<DocsAndCost> sidewaysTwoPhaseDims = null;
for (DocsAndCost dim : dims) {
if (dim.twoPhase != null) {
- if (twoPhaseDims == null) {
- twoPhaseDims = new ArrayList<>(dims.length);
+ if (sidewaysTwoPhaseDims == null) {
+ sidewaysTwoPhaseDims = new ArrayList<>();
}
- twoPhaseDims.add(dim);
+ sidewaysTwoPhaseDims.add(dim);
}
}
- if (twoPhaseDims != null) {
- CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
+ if (sidewaysTwoPhaseDims != null) {
+ CollectionUtil.timSort(sidewaysTwoPhaseDims, TWO_PHASE_COMPARATOR);
}
+ // We keep track of a "runaway" dimension, which is a previously "near missed" dimension that
+ // has advanced beyond the docID the rest of the dimensions are positioned on. This functions
+ // a bit like the "head" queue in WANDScorer's "min should match" implementation. We use a
+ // single-valued PQ ordered by docID to easily determine the "closest" runaway dim we'll use
+ // for advancing in the case that multiple dim approximations miss.
+ PriorityQueue<DocsAndCost> runawayDim =
+ new PriorityQueue<>(1) {
+ @Override
+ protected boolean lessThan(DocsAndCost a, DocsAndCost b) {
+ return a.approximation.docID() < b.approximation.docID();
+ }
+ };
+
int docID = baseApproximation.docID();
nextDoc:
- while (docID != PostingsEnum.NO_MORE_DOCS) {
+ while (docID != DocIdSetIterator.NO_MORE_DOCS) {
assert docID == baseApproximation.docID();
if (acceptDocs != null && acceptDocs.get(docID) == false) {
@@ -202,38 +253,50 @@ class DrillSidewaysScorer extends BulkScorer {
continue;
}
- DocsAndCost failedDim = null;
- for (DocsAndCost dim : allDims) {
- final int dimDocID;
- if (dim.approximation.docID() < docID) {
- dimDocID = dim.approximation.advance(docID);
- } else {
- dimDocID = dim.approximation.docID();
- }
- if (dimDocID != docID) {
- if (failedDim != null) {
- int next = Math.min(dimDocID, failedDim.approximation.docID());
+ // If we carried a "runaway" over from the last iteration, see if we've "caught up" yet:
+ DocsAndCost runaway = runawayDim.top();
+ if (runaway != null && runaway.approximation.docID() <= docID) {
+ runawayDim.clear();
+ runaway = null;
+ }
+
+ // Check the sideways dim approximations. At most, one dim is allowed to miss for the doc
+ // to be a near-miss or full match. If multiple sideways dims miss, we move on:
+ for (DocsAndCost dim : sidewaysDims) {
+ int dimDocID = advanceIfBehind(docID, dim.approximation);
+ if (dimDocID != docID && dim != runaway) {
+ DocsAndCost evicted = runawayDim.insertWithOverflow(dim);
+ if (evicted != null) {
+ // More than one dim has advanced beyond docID, so we jump ahead to the "closer" of
+ // the two:
+ int next = evicted.approximation.docID();
docID = baseApproximation.advance(next);
continue nextDoc;
- } else {
- failedDim = dim;
}
}
}
+ // At this point, we have an "approximate" near-miss or full match, but we still need
+ // to confirm two-phase iterators. First, check the base two-phase (it's always required):
if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
docID = baseApproximation.nextDoc();
continue;
}
- if (twoPhaseDims != null) {
+ // If we have two-phase iterators for our sideways dims, check them now. At most, one
+ // sideways dim can miss for the doc to be a near-miss or full match. If more than one misses
+ // we move on:
+ DocsAndCost failedDim = runawayDim.top();
+ if (sidewaysTwoPhaseDims != null) {
if (failedDim == null) {
- for (DocsAndCost dim : twoPhaseDims) {
- assert dim.approximation.docID() == baseApproximation.docID();
+ // If all sideways dims matched in their approximation phase, then we can allow one
+ // second-phase check to fail:
+ for (DocsAndCost dim : sidewaysTwoPhaseDims) {
+ assert dim.approximation.docID() == docID;
if (dim.twoPhase.matches() == false) {
if (failedDim != null) {
- int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
- docID = baseApproximation.advance(next);
+ // Two second-phase checks have failed, so we move on:
+ docID = baseApproximation.nextDoc();
continue nextDoc;
} else {
failedDim = dim;
@@ -241,14 +304,14 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
} else {
- for (DocsAndCost dim : twoPhaseDims) {
+ // If a sideways dim failed the approximate check, then no second-phase checks can fail:
+ for (DocsAndCost dim : sidewaysTwoPhaseDims) {
if (failedDim == dim) {
continue;
}
- assert dim.approximation.docID() == baseApproximation.docID();
+ assert dim.approximation.docID() == docID;
if (dim.twoPhase.matches() == false) {
- int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
- docID = baseApproximation.advance(next);
+ docID = baseApproximation.nextDoc();
continue nextDoc;
}
}
@@ -258,9 +321,9 @@ class DrillSidewaysScorer extends BulkScorer {
collectDocID = docID;
if (failedDim == null) {
// Hit passed all filters, so it's "real":
- collectHit(collector, dims);
+ collectHit(collector, sidewaysDims);
} else {
- // Hit missed exactly one filter:
+ // Hit missed exactly one dim:
collectNearMiss(failedDim.sidewaysLeafCollector);
}
@@ -268,6 +331,14 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
+ private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
+ if (iterator.docID() < docID) {
+ return iterator.advance(docID);
+ } else {
+ return iterator.docID();
+ }
+ }
+
/** Used when drill downs are highly constraining vs baseQuery. */
private void doDrillDownAdvanceScoring(
Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
@@ -651,6 +722,28 @@ class DrillSidewaysScorer extends BulkScorer {
}
}
+ private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException {
+ collector.collect(collectDocID);
+ if (drillDownCollector != null) {
+ drillDownLeafCollector.collect(collectDocID);
+ }
+
+ // Tally sideways count:
+ dim.sidewaysLeafCollector.collect(collectDocID);
+ }
+
+ private void collectHit(LeafCollector collector, List<DocsAndCost> dims) throws IOException {
+ collector.collect(collectDocID);
+ if (drillDownCollector != null) {
+ drillDownLeafCollector.collect(collectDocID);
+ }
+
+ // Tally sideways counts:
+ for (DocsAndCost dim : dims) {
+ dim.sidewaysLeafCollector.collect(collectDocID);
+ }
+ }
+
private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
// if (DEBUG) {
// System.out.println(" missingDim=" + dim);
@@ -689,8 +782,10 @@ class DrillSidewaysScorer extends BulkScorer {
static class DocsAndCost {
// approximation of matching docs, or the scorer itself
final DocIdSetIterator approximation;
+ final long cost;
// two-phase confirmation, or null if the approximation is accurate
final TwoPhaseIterator twoPhase;
+ final float matchCost;
final Collector sidewaysCollector;
LeafCollector sidewaysLeafCollector;
@@ -699,10 +794,13 @@ class DrillSidewaysScorer extends BulkScorer {
if (twoPhase == null) {
this.approximation = scorer.iterator();
this.twoPhase = null;
+ this.matchCost = 0f;
} else {
this.approximation = twoPhase.approximation();
this.twoPhase = twoPhase;
+ this.matchCost = twoPhase.matchCost();
}
+ this.cost = approximation.cost();
this.sidewaysCollector = sidewaysCollector;
}
}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
index b479b752c4d..04668aff4b1 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@@ -297,36 +297,26 @@ public class TestDrillSideways extends FacetTestCase {
// NRT open
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
+ // Run all the basic test cases with a standard DrillSideways implementation:
DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
+ runDrillSidewaysTestCases(config, ds);
+ // Run all the basic test cases but make sure DS is set to score all sub-docs at once, so
+ // we exercise the doc-at-a-time scoring methodology:
+ ds = getNewDrillSidewaysScoreSubdocsAtOnce(searcher, config, taxoReader);
+ runDrillSidewaysTestCases(config, ds);
+
+ writer.close();
+ IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
+ }
+
+ private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) throws Exception {
// case: drill-down on a single field; in this
// case the drill-sideways + drill-down counts ==
// drill-down of just the query:
DrillDownQuery ddq = new DrillDownQuery(config);
ddq.add("Author", "Lisa");
DrillSidewaysResult r = ds.search(null, ddq, 10);
- assertEquals(2, r.hits.totalHits.value);
- // Publish Date is only drill-down, and Lisa published
- // one in 2012 and one in 2010:
- assertEquals(
- "dim=Publish Date path=[] value=2 childCount=2\n 2010 (1)\n 2012 (1)\n",
- r.facets.getTopChildren(10, "Publish Date").toString());
-
- // Author is drill-sideways + drill-down: Lisa
- // (drill-down) published twice, and Frank/Susan/Bob
- // published once:
- assertEquals(
- "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
- r.facets.getTopChildren(10, "Author").toString());
-
- // Same simple case, but no baseQuery (pure browse):
- // drill-down on a single field; in this case the
- // drill-sideways + drill-down counts == drill-down of
- // just the query:
- ddq = new DrillDownQuery(config);
- ddq.add("Author", "Lisa");
- r = ds.search(null, ddq, 10);
-
assertEquals(2, r.hits.totalHits.value);
// Publish Date is only drill-down, and Lisa published
// one in 2012 and one in 2010:
@@ -484,9 +474,6 @@ public class TestDrillSideways extends FacetTestCase {
() -> {
finalR.facets.getTopChildren(0, "Author");
});
-
- writer.close();
- IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
}
public void testBasicWithCollectorManager() throws Exception {