You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/11/08 18:08:18 UTC
[lucene] branch main updated: Further optimize DrillSideways scoring (#11881)

This is an automated email from the ASF dual-hosted git repository.

gsmiller pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new c66a5590500 Further optimize DrillSideways scoring (#11881)
c66a5590500 is described below

commit c66a559050048a28d1528c7f562367c8738fe90b
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Tue Nov 8 10:08:12 2022 -0800

    Further optimize DrillSideways scoring (#11881)
---
 lucene/CHANGES.txt                                 |   3 +
 .../apache/lucene/facet/DrillSidewaysScorer.java   | 186 ++++++++++++++++-----
 .../org/apache/lucene/facet/TestDrillSideways.java |  37 ++--
 3 files changed, 157 insertions(+), 69 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index bb0bb3539bc..00268e2b9af 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -160,6 +160,9 @@ Optimizations
 * GITHUB#11876: Use ByteArrayComparator to speed up PointInSetQuery in single dimension case.
   (Guo Feng)
 
+* GITHUB#11881: Further optimize drill-sideways scoring by specializing the single dimension case
+  and borrowing some concepts from "min should match" scoring. (Greg Miller)
+
 * GITHUB#11884: Simplify the logic of matchAll() in IndexSortSortedNumericDocValuesRangeQuery. (Lu Xugang)
 
 Other
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
index 360cb0bf74a..fc415e14897 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java
@@ -18,13 +18,11 @@ package org.apache.lucene.facet;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.DocIdSetIterator;
@@ -36,19 +34,15 @@ import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.CollectionUtil;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.PriorityQueue;
 
 class DrillSidewaysScorer extends BulkScorer {
 
   private static final Comparator<DocsAndCost> APPROXIMATION_COMPARATOR =
-      Comparator.comparingLong(e -> e.approximation.cost());
+      Comparator.comparingLong(e -> e.cost);
 
   private static final Comparator<DocsAndCost> TWO_PHASE_COMPARATOR =
-      new Comparator<DocsAndCost>() {
-        @Override
-        public int compare(DocsAndCost o1, DocsAndCost o2) {
-          return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
-        }
-      };
+      (a, b) -> Float.compare(a.matchCost, b.matchCost);
 
   // private static boolean DEBUG = false;
 
@@ -166,35 +160,92 @@ class DrillSidewaysScorer extends BulkScorer {
     return Integer.MAX_VALUE;
   }
 
+  /**
+   * Query-first scoring specialization when there is only one drill-sideways dimension, which is
+   * likely a common scenario.
+   */
+  private void doQueryFirstScoringSingleDim(
+      Bits acceptDocs, LeafCollector collector, DocsAndCost dim) throws IOException {
+    int docID = baseApproximation.docID();
+    while (docID != DocIdSetIterator.NO_MORE_DOCS) {
+      assert docID == baseApproximation.docID();
+
+      if (acceptDocs != null && acceptDocs.get(docID) == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      // We have either a near-miss or full match. Check the sideways dim to see which it is:
+      collectDocID = docID;
+      if (advanceIfBehind(docID, dim.approximation) != docID
+          || (dim.twoPhase != null && dim.twoPhase.matches() == false)) {
+        // The sideways dim missed, so we have a "near miss":
+        collectNearMiss(dim.sidewaysLeafCollector);
+      } else {
+        // Hit passed all filters, so it's "real":
+        collectHit(collector, dim);
+      }
+
+      docID = baseApproximation.nextDoc();
+    }
+  }
+
   /**
    * Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
-   * at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
-   * and .advance() on the dim filters.
+   * at once (i.e., like BooleanScorer2, not BooleanScorer).
    */
   private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
       throws IOException {
     setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
 
-    List<DocsAndCost> allDims = Arrays.asList(dims);
-    CollectionUtil.timSort(allDims, APPROXIMATION_COMPARATOR);
+    // Specialize the single-dim use-case as we have a more efficient implementation for that:
+    if (dims.length == 1) {
+      doQueryFirstScoringSingleDim(acceptDocs, collector, dims[0]);
+      return;
+    }
+
+    // Sort our sideways dims by approximation cost so we can advance the lower cost ones first:
+    List<DocsAndCost> sidewaysDims = new ArrayList<>(dims.length);
+    sidewaysDims.addAll(List.of(dims));
+    CollectionUtil.timSort(sidewaysDims, APPROXIMATION_COMPARATOR);
 
-    List<DocsAndCost> twoPhaseDims = null;
+    // Maintain (optional) subset of sideways dims that support two-phase iteration, sorted by
+    // matchCost:
+    List<DocsAndCost> sidewaysTwoPhaseDims = null;
     for (DocsAndCost dim : dims) {
       if (dim.twoPhase != null) {
-        if (twoPhaseDims == null) {
-          twoPhaseDims = new ArrayList<>(dims.length);
+        if (sidewaysTwoPhaseDims == null) {
+          sidewaysTwoPhaseDims = new ArrayList<>();
         }
-        twoPhaseDims.add(dim);
+        sidewaysTwoPhaseDims.add(dim);
       }
     }
-    if (twoPhaseDims != null) {
-      CollectionUtil.timSort(twoPhaseDims, TWO_PHASE_COMPARATOR);
+    if (sidewaysTwoPhaseDims != null) {
+      CollectionUtil.timSort(sidewaysTwoPhaseDims, TWO_PHASE_COMPARATOR);
     }
 
+    // We keep track of a "runaway" dimension, which is a previously "near missed" dimension that
+    // has advanced beyond the docID the rest of the dimensions are positioned on. This functions
+    // a bit like the "head" queue in WANDScorer's "min should match" implementation. We use a
+    // single-valued PQ ordered by docID to easily determine the "closest" runaway dim we'll use
+    // for advancing in the case that multiple dim approximations miss.
+    PriorityQueue<DocsAndCost> runawayDim =
+        new PriorityQueue<>(1) {
+          @Override
+          protected boolean lessThan(DocsAndCost a, DocsAndCost b) {
+            return a.approximation.docID() < b.approximation.docID();
+          }
+        };
+
     int docID = baseApproximation.docID();
 
     nextDoc:
-    while (docID != PostingsEnum.NO_MORE_DOCS) {
+    while (docID != DocIdSetIterator.NO_MORE_DOCS) {
       assert docID == baseApproximation.docID();
 
       if (acceptDocs != null && acceptDocs.get(docID) == false) {
@@ -202,38 +253,50 @@ class DrillSidewaysScorer extends BulkScorer {
         continue;
       }
 
-      DocsAndCost failedDim = null;
-      for (DocsAndCost dim : allDims) {
-        final int dimDocID;
-        if (dim.approximation.docID() < docID) {
-          dimDocID = dim.approximation.advance(docID);
-        } else {
-          dimDocID = dim.approximation.docID();
-        }
-        if (dimDocID != docID) {
-          if (failedDim != null) {
-            int next = Math.min(dimDocID, failedDim.approximation.docID());
+      // If we carried a "runaway" over from the last iteration, see if we've "caught up" yet:
+      DocsAndCost runaway = runawayDim.top();
+      if (runaway != null && runaway.approximation.docID() <= docID) {
+        runawayDim.clear();
+        runaway = null;
+      }
+
+      // Check the sideways dim approximations. At most, one dim is allowed to miss for the doc
+      // to be a near-miss or full match. If multiple sideways dims miss, we move on:
+      for (DocsAndCost dim : sidewaysDims) {
+        int dimDocID = advanceIfBehind(docID, dim.approximation);
+        if (dimDocID != docID && dim != runaway) {
+          DocsAndCost evicted = runawayDim.insertWithOverflow(dim);
+          if (evicted != null) {
+            // More than one dim has advanced beyond docID, so we jump ahead to the "closer" of
+            // the two:
+            int next = evicted.approximation.docID();
             docID = baseApproximation.advance(next);
             continue nextDoc;
-          } else {
-            failedDim = dim;
           }
         }
       }
 
+      // At this point, we have an "approximate" near-miss or full match, but we still need
+      // to confirm two-phase iterators. First, check the base two-phase (it's always required):
       if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
         docID = baseApproximation.nextDoc();
         continue;
       }
 
-      if (twoPhaseDims != null) {
+      // If we have two-phase iterators for our sideways dims, check them now. At most, one
+      // sideways dim can miss for the doc to be a near-miss or full match. If more than one misses
+      // we move on:
+      DocsAndCost failedDim = runawayDim.top();
+      if (sidewaysTwoPhaseDims != null) {
         if (failedDim == null) {
-          for (DocsAndCost dim : twoPhaseDims) {
-            assert dim.approximation.docID() == baseApproximation.docID();
+          // If all sideways dims matched in their approximation phase, then we can allow one
+          // second-phase check to fail:
+          for (DocsAndCost dim : sidewaysTwoPhaseDims) {
+            assert dim.approximation.docID() == docID;
             if (dim.twoPhase.matches() == false) {
               if (failedDim != null) {
-                int next = Math.min(dim.approximation.nextDoc(), failedDim.approximation.nextDoc());
-                docID = baseApproximation.advance(next);
+                // Two second-phase checks have failed, so we move on:
+                docID = baseApproximation.nextDoc();
                 continue nextDoc;
               } else {
                 failedDim = dim;
@@ -241,14 +304,14 @@ class DrillSidewaysScorer extends BulkScorer {
             }
           }
         } else {
-          for (DocsAndCost dim : twoPhaseDims) {
+          // If a sideways dim failed the approximate check, then no second-phase checks can fail:
+          for (DocsAndCost dim : sidewaysTwoPhaseDims) {
             if (failedDim == dim) {
               continue;
             }
-            assert dim.approximation.docID() == baseApproximation.docID();
+            assert dim.approximation.docID() == docID;
             if (dim.twoPhase.matches() == false) {
-              int next = Math.min(failedDim.approximation.docID(), dim.approximation.nextDoc());
-              docID = baseApproximation.advance(next);
+              docID = baseApproximation.nextDoc();
               continue nextDoc;
             }
           }
@@ -258,9 +321,9 @@ class DrillSidewaysScorer extends BulkScorer {
       collectDocID = docID;
       if (failedDim == null) {
         // Hit passed all filters, so it's "real":
-        collectHit(collector, dims);
+        collectHit(collector, sidewaysDims);
       } else {
-        // Hit missed exactly one filter:
+        // Hit missed exactly one dim:
         collectNearMiss(failedDim.sidewaysLeafCollector);
       }
 
@@ -268,6 +331,14 @@ class DrillSidewaysScorer extends BulkScorer {
     }
   }
 
+  private static int advanceIfBehind(int docID, DocIdSetIterator iterator) throws IOException {
+    if (iterator.docID() < docID) {
+      return iterator.advance(docID);
+    } else {
+      return iterator.docID();
+    }
+  }
+
   /** Used when drill downs are highly constraining vs baseQuery. */
   private void doDrillDownAdvanceScoring(
       Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
@@ -651,6 +722,28 @@ class DrillSidewaysScorer extends BulkScorer {
     }
   }
 
+  private void collectHit(LeafCollector collector, DocsAndCost dim) throws IOException {
+    collector.collect(collectDocID);
+    if (drillDownCollector != null) {
+      drillDownLeafCollector.collect(collectDocID);
+    }
+
+    // Tally sideways count:
+    dim.sidewaysLeafCollector.collect(collectDocID);
+  }
+
+  private void collectHit(LeafCollector collector, List<DocsAndCost> dims) throws IOException {
+    collector.collect(collectDocID);
+    if (drillDownCollector != null) {
+      drillDownLeafCollector.collect(collectDocID);
+    }
+
+    // Tally sideways counts:
+    for (DocsAndCost dim : dims) {
+      dim.sidewaysLeafCollector.collect(collectDocID);
+    }
+  }
+
   private void collectNearMiss(LeafCollector sidewaysCollector) throws IOException {
     // if (DEBUG) {
     //  System.out.println("      missingDim=" + dim);
@@ -689,8 +782,10 @@ class DrillSidewaysScorer extends BulkScorer {
   static class DocsAndCost {
     // approximation of matching docs, or the scorer itself
     final DocIdSetIterator approximation;
+    final long cost;
     // two-phase confirmation, or null if the approximation is accurate
     final TwoPhaseIterator twoPhase;
+    final float matchCost;
     final Collector sidewaysCollector;
     LeafCollector sidewaysLeafCollector;
 
@@ -699,10 +794,13 @@ class DrillSidewaysScorer extends BulkScorer {
       if (twoPhase == null) {
         this.approximation = scorer.iterator();
         this.twoPhase = null;
+        this.matchCost = 0f;
       } else {
         this.approximation = twoPhase.approximation();
         this.twoPhase = twoPhase;
+        this.matchCost = twoPhase.matchCost();
       }
+      this.cost = approximation.cost();
       this.sidewaysCollector = sidewaysCollector;
     }
   }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
index b479b752c4d..04668aff4b1 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@@ -297,36 +297,26 @@ public class TestDrillSideways extends FacetTestCase {
     // NRT open
     TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
 
+    // Run all the basic test cases with a standard DrillSideways implementation:
     DrillSideways ds = getNewDrillSideways(searcher, config, taxoReader);
+    runDrillSidewaysTestCases(config, ds);
 
+    // Run all the basic test cases but make sure DS is set to score all sub-docs at once, so
+    // we exercise the doc-at-a-time scoring methodology:
+    ds = getNewDrillSidewaysScoreSubdocsAtOnce(searcher, config, taxoReader);
+    runDrillSidewaysTestCases(config, ds);
+
+    writer.close();
+    IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
+  }
+
+  private void runDrillSidewaysTestCases(FacetsConfig config, DrillSideways ds) throws Exception {
     //  case: drill-down on a single field; in this
     // case the drill-sideways + drill-down counts ==
     // drill-down of just the query:
     DrillDownQuery ddq = new DrillDownQuery(config);
     ddq.add("Author", "Lisa");
     DrillSidewaysResult r = ds.search(null, ddq, 10);
-    assertEquals(2, r.hits.totalHits.value);
-    // Publish Date is only drill-down, and Lisa published
-    // one in 2012 and one in 2010:
-    assertEquals(
-        "dim=Publish Date path=[] value=2 childCount=2\n  2010 (1)\n  2012 (1)\n",
-        r.facets.getTopChildren(10, "Publish Date").toString());
-
-    // Author is drill-sideways + drill-down: Lisa
-    // (drill-down) published twice, and Frank/Susan/Bob
-    // published once:
-    assertEquals(
-        "dim=Author path=[] value=5 childCount=4\n  Lisa (2)\n  Bob (1)\n  Susan (1)\n  Frank (1)\n",
-        r.facets.getTopChildren(10, "Author").toString());
-
-    // Same simple case, but no baseQuery (pure browse):
-    // drill-down on a single field; in this case the
-    // drill-sideways + drill-down counts == drill-down of
-    // just the query:
-    ddq = new DrillDownQuery(config);
-    ddq.add("Author", "Lisa");
-    r = ds.search(null, ddq, 10);
-
     assertEquals(2, r.hits.totalHits.value);
     // Publish Date is only drill-down, and Lisa published
     // one in 2012 and one in 2010:
@@ -484,9 +474,6 @@ public class TestDrillSideways extends FacetTestCase {
         () -> {
           finalR.facets.getTopChildren(0, "Author");
         });
-
-    writer.close();
-    IOUtils.close(searcher.getIndexReader(), taxoReader, taxoWriter, dir, taxoDir);
   }
 
   public void testBasicWithCollectorManager() throws Exception {