You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/29 12:22:37 UTC
[lucene] branch main updated: DrillSideways optimizations (#11803)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new d02ba3134ff DrillSideways optimizations (#11803)
d02ba3134ff is described below
commit d02ba3134ff58f578e31fad739095929db82db8a
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Thu Sep 29 05:22:30 2022 -0700
DrillSideways optimizations (#11803)
DrillSidewaysScorer now breaks up first- and second-phase matching and makes use of advance() instead of nextDoc() where possible.
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/facet/DrillSidewaysScorer.java | 120 ++++++++++++++++-----
.../org/apache/lucene/facet/TestDrillSideways.java | 14 ++-
3 files changed, 107 insertions(+), 30 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8abaecf3d92..80fb2047f55 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -129,6 +129,9 @@ Optimizations
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski)
+* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
+ possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
+
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
index bf006f36a4a..360cb0bf74a 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@@ -17,8 +17,12 @@
package org.apache.lucene.facet;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
@@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer {
+ private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
+ Comparator.comparingLong(e -> e.approximation.cost());
+
+ private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
+ new Comparator<DocsAndCost>() {
+ @Override
+ public int compare(DocsAndCost o1, DocsAndCost o2) {
+ return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
+ }
+ };
+
// private static boolean DEBUG = false;
private final Collector drillDownCollector;
@@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
// DrillDown DocsEnums:
private final Scorer baseScorer;
private final DocIdSetIterator baseIterator;
+ private final DocIdSetIterator baseApproximation;
+ private final TwoPhaseIterator baseTwoPhase;
private final LeafReaderContext context;
@@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
this.context = context;
this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator();
+ this.baseTwoPhase = baseScorer.twoPhaseIterator();
+ if (baseTwoPhase != null) {
+ this.baseApproximation = baseTwoPhase.approximation();
+ } else {
+ this.baseApproximation = baseIterator;
+ }
this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
}
@@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer {
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
- // if (DEBUG) {
- // System.out.println(" doQueryFirstScoring");
- // }
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
- int docID = baseScorer.docID();
+ List<DocsAndCost> allDims = Arrays.asList(dims);
+ CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
+
+ List<DocsAndCost> twoPhaseDims = null;
+ for (DocsAndCost dim : dims) {
+ if (dim.twoPhase != null) {
+ if (twoPhaseDims == null) {
+ twoPhaseDims = new ArrayList<>(dims.length);
+ }
+ twoPhaseDims.add(dim);
+ }
+ }
+ if (twoPhaseDims != null) {
+ CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
+ }
+
+ int docID = baseApproximation.docID();
nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) {
+ assert docID == baseApproximation.docID();
+
if (acceptDocs != null && acceptDocs.get(docID) == false) {
- docID = baseIterator.nextDoc();
+ docID = baseApproximation.nextDoc();
continue;
}
- LeafCollector failedCollector = null;
- for (DocsAndCost dim : dims) {
- // TODO: should we sort this 2nd dimension of
- // docsEnums from most frequent to least?
+
+ DocsAndCost failedDim = null;
+ for (DocsAndCost dim : allDims) {
+ final int dimDocID;
if (dim.approximation.docID() < docID) {
- dim.approximation.advance(docID);
+ dimDocID = dim.approximation.advance(docID);
+ } else {
+ dimDocID = dim.approximation.docID();
}
-
- boolean matches = false;
- if (dim.approximation.docID() == docID) {
- if (dim.twoPhase == null) {
- matches = true;
+ if (dimDocID != docID) {
+ if (failedDim != null) {
+ int next = Math.min(dimDocID, failedDim.approximation.docID());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
} else {
- matches = dim.twoPhase.matches();
+ failedDim = dim;
}
}
+ }
- if (matches == false) {
- if (failedCollector != null) {
- // More than one dim fails on this document, so
- // it's neither a hit nor a near-miss; move to
- // next doc:
- docID = baseIterator.nextDoc();
- continue nextDoc;
- } else {
- failedCollector = dim.sidewaysLeafCollector;
+ if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+ docID = baseApproximation.nextDoc();
+ continue;
+ }
+
+ if (twoPhaseDims != null) {
+ if (failedDim == null) {
+ for (DocsAndCost dim : twoPhaseDims) {
+ assert dim.approximation.docID() == baseApproximation.docID();
+ if (dim.twoPhase.matches() == false) {
+ if (failedDim != null) {
+ int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
+ } else {
+ failedDim = dim;
+ }
+ }
+ }
+ } else {
+ for (DocsAndCost dim : twoPhaseDims) {
+ if (failedDim == dim) {
+ continue;
+ }
+ assert dim.approximation.docID() == baseApproximation.docID();
+ if (dim.twoPhase.matches() == false) {
+ int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
+ }
}
}
}
collectDocID = docID;
-
- if (failedCollector == null) {
+ if (failedDim == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
} else {
// Hit missed exactly one filter:
- collectNearMiss(failedCollector);
+ collectNearMiss(failedDim.sidewaysLeafCollector);
}
- docID = baseIterator.nextDoc();
+ docID = baseApproximation.nextDoc();
}
}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
index 7274bfe7a45..36382041a5c 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@@ -33,6 +33,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
+ doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
@@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
+ doc.add(
+ new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
@@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase {
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) {
- ddq.add("dim" + dim, value);
+ // Sometimes use a "traditional" term query and sometimes use a two-phase approach to
+ // ensure code coverage:
+ if (random().nextBoolean()) {
+ ddq.add("dim" + dim, value);
+ } else {
+ ddq.add(
+ "dim" + dim,
+ SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
+ }
}
}
}