You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/29 12:22:37 UTC
[lucene] branch main updated: DrillSideways optimizations (#11803)
This is an automated email from the ASF dual-hosted git repository.
gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new d02ba3134ff DrillSideways optimizations (#11803)
d02ba3134ff is described below
commit d02ba3134ff58f578e31fad739095929db82db8a
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Thu Sep 29 05:22:30 2022 -0700
DrillSideways optimizations (#11803)
DrillSidewaysScorer now breaks up first- and second-phase matching and makes use of advance() instead of nextDoc() where possible.
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/facet/DrillSidewaysScorer.java | 120 ++++++++++++++++-----
.../org/apache/lucene/facet/TestDrillSideways.java | 14 ++-
3 files changed, 107 insertions(+), 30 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8abaecf3d92..80fb2047f55 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -129,6 +129,9 @@ Optimizations
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski)
+* GITHUB#11797: DrillSidewaysScorer has been improved to leverage "advance" instead of "next" where
+ possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
+
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
index bf006f36a4a..360cb0bf74a 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@@ -17,8 +17,12 @@
package org.apache.lucene.facet;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.BulkScorer;
@@ -30,10 +34,22 @@ import org.apache.lucene.search.ScoreCachingWrappingScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.FixedBitSet;
class DrillSidewaysScorer extends BulkScorer {
+ private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
+ Comparator.comparingLong(e -> e.approximation.cost());
+
+ private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
+ new Comparator<DocsAndCost>() {
+ @Override
+ public int compare(DocsAndCost o1, DocsAndCost o2) {
+ return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
+ }
+ };
+
// private static boolean DEBUG = false;
private final Collector drillDownCollector;
@@ -44,6 +60,8 @@ class DrillSidewaysScorer extends BulkScorer {
// DrillDown DocsEnums:
private final Scorer baseScorer;
private final DocIdSetIterator baseIterator;
+ private final DocIdSetIterator baseApproximation;
+ private final TwoPhaseIterator baseTwoPhase;
private final LeafReaderContext context;
@@ -65,6 +83,12 @@ class DrillSidewaysScorer extends BulkScorer {
this.context = context;
this.baseScorer = baseScorer;
this.baseIterator = baseScorer.iterator();
+ this.baseTwoPhase = baseScorer.twoPhaseIterator();
+ if (baseTwoPhase != null) {
+ this.baseApproximation = baseTwoPhase.approximation();
+ } else {
+ this.baseApproximation = baseIterator;
+ }
this.drillDownCollector = drillDownCollector;
this.scoreSubDocsAtOnce = scoreSubDocsAtOnce;
}
@@ -149,60 +173,98 @@ class DrillSidewaysScorer extends BulkScorer {
*/
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
throws IOException {
- // if (DEBUG) {
- // System.out.println(" doQueryFirstScoring");
- // }
setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
- int docID = baseScorer.docID();
+ List<DocsAndCost> allDims = Arrays.asList(dims);
+ CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
+
+ List<DocsAndCost> twoPhaseDims = null;
+ for (DocsAndCost dim : dims) {
+ if (dim.twoPhase != null) {
+ if (twoPhaseDims == null) {
+ twoPhaseDims = new ArrayList<>(dims.length);
+ }
+ twoPhaseDims.add(dim);
+ }
+ }
+ if (twoPhaseDims != null) {
+ CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
+ }
+
+ int docID = baseApproximation.docID();
nextDoc:
while (docID != PostingsEnum.NO_MORE_DOCS) {
+ assert docID == baseApproximation.docID();
+
if (acceptDocs != null && acceptDocs.get(docID) == false) {
- docID = baseIterator.nextDoc();
+ docID = baseApproximation.nextDoc();
continue;
}
- LeafCollector failedCollector = null;
- for (DocsAndCost dim : dims) {
- // TODO: should we sort this 2nd dimension of
- // docsEnums from most frequent to least?
+
+ DocsAndCost failedDim = null;
+ for (DocsAndCost dim : allDims) {
+ final int dimDocID;
if (dim.approximation.docID() < docID) {
- dim.approximation.advance(docID);
+ dimDocID = dim.approximation.advance(docID);
+ } else {
+ dimDocID = dim.approximation.docID();
}
-
- boolean matches = false;
- if (dim.approximation.docID() == docID) {
- if (dim.twoPhase == null) {
- matches = true;
+ if (dimDocID != docID) {
+ if (failedDim != null) {
+ int next = Math.min(dimDocID, failedDim.approximation.docID());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
} else {
- matches = dim.twoPhase.matches();
+ failedDim = dim;
}
}
+ }
- if (matches == false) {
- if (failedCollector != null) {
- // More than one dim fails on this document, so
- // it's neither a hit nor a near-miss; move to
- // next doc:
- docID = baseIterator.nextDoc();
- continue nextDoc;
- } else {
- failedCollector = dim.sidewaysLeafCollector;
+ if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+ docID = baseApproximation.nextDoc();
+ continue;
+ }
+
+ if (twoPhaseDims != null) {
+ if (failedDim == null) {
+ for (DocsAndCost dim : twoPhaseDims) {
+ assert dim.approximation.docID() == baseApproximation.docID();
+ if (dim.twoPhase.matches() == false) {
+ if (failedDim != null) {
+ int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
+ } else {
+ failedDim = dim;
+ }
+ }
+ }
+ } else {
+ for (DocsAndCost dim : twoPhaseDims) {
+ if (failedDim == dim) {
+ continue;
+ }
+ assert dim.approximation.docID() == baseApproximation.docID();
+ if (dim.twoPhase.matches() == false) {
+ int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
+ docID = baseApproximation.advance(next);
+ continue nextDoc;
+ }
}
}
}
collectDocID = docID;
-
- if (failedCollector == null) {
+ if (failedDim == null) {
// Hit passed all filters, so it's "real":
collectHit(collector, dims);
} else {
// Hit missed exactly one filter:
- collectNearMiss(failedCollector);
+ collectNearMiss(failedDim.sidewaysLeafCollector);
}
- docID = baseIterator.nextDoc();
+ docID = baseApproximation.nextDoc();
}
}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
index 7274bfe7a45..36382041a5c 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@@ -33,6 +33,7 @@ import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
@@ -1051,6 +1052,7 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue], Field.Store.YES));
+ doc.add(new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
@@ -1063,6 +1065,8 @@ public class TestDrillSideways extends FacetTestCase {
doc.add(new FacetField("dim" + dim, dimValues[dim][dimValue2]));
}
doc.add(new StringField("dim" + dim, dimValues[dim][dimValue2], Field.Store.YES));
+ doc.add(
+ new SortedSetDocValuesField("dim" + dim, new BytesRef(dimValues[dim][dimValue2])));
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
@@ -1188,7 +1192,15 @@ public class TestDrillSideways extends FacetTestCase {
for (int dim = 0; dim < numDims; dim++) {
if (drillDowns[dim] != null) {
for (String value : drillDowns[dim]) {
- ddq.add("dim" + dim, value);
+ // Sometimes use a "traditional" term query and sometimes use a two-phase approach to
+ // ensure code coverage:
+ if (random().nextBoolean()) {
+ ddq.add("dim" + dim, value);
+ } else {
+ ddq.add(
+ "dim" + dim,
+ SortedSetDocValuesField.newSlowExactQuery("dim" + dim, new BytesRef(value)));
+ }
}
}
}