You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2022/09/26 20:41:56 UTC

[GitHub] [lucene] zhaih commented on a diff in pull request #11803: DrillSideways optimizations

zhaih commented on code in PR #11803:
URL: https://github.com/apache/lucene/pull/11803#discussion_r980469938


##########
lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java:
##########
@@ -149,63 +162,147 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int maxDoc)
    */
   private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
       throws IOException {
-    // if (DEBUG) {
-    //  System.out.println("  doQueryFirstScoring");
-    // }
     setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
 
-    int docID = baseScorer.docID();
+    List<DocsAndCost> allDims = Arrays.asList(dims);
+    CollectionUtil.timSort(allDims, Comparator.comparingLong(e -> e.approximation.cost()));
+
+    List<DocsAndCost> twoPhaseDims = null;
+    for (DocsAndCost dim : dims) {
+      if (dim.twoPhase != null) {
+        if (twoPhaseDims == null) {
+          twoPhaseDims = new ArrayList<>(dims.length);
+        }
+        twoPhaseDims.add(dim);
+      }
+    }
+    if (twoPhaseDims != null) {
+      CollectionUtil.timSort(
+          twoPhaseDims,
+          new Comparator<DocsAndCost>() {
+            @Override
+            public int compare(DocsAndCost o1, DocsAndCost o2) {
+              return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
+            }
+          });
+    }
+
+    int docID = baseApproximation.docID();
 
     nextDoc:
     while (docID != PostingsEnum.NO_MORE_DOCS) {
+      assert docID == baseApproximation.docID();
+
       if (acceptDocs != null && acceptDocs.get(docID) == false) {
-        docID = baseIterator.nextDoc();
+        docID = baseApproximation.nextDoc();
         continue;
       }
-      LeafCollector failedCollector = null;
-      for (DocsAndCost dim : dims) {
-        // TODO: should we sort this 2nd dimension of
-        // docsEnums from most frequent to least?
+
+      DocsAndCost failedDim = null;
+      for (DocsAndCost dim : allDims) {
+        final int dimDocID;
         if (dim.approximation.docID() < docID) {
-          dim.approximation.advance(docID);
+          dimDocID = dim.approximation.advance(docID);
+        } else {
+          dimDocID = dim.approximation.docID();
         }
-
-        boolean matches = false;
-        if (dim.approximation.docID() == docID) {
-          if (dim.twoPhase == null) {
-            matches = true;
+        if (dimDocID != docID) {
+          if (failedDim != null) {
+            int next = Math.min(dimDocID, failedDim.approximation.docID());
+            docID = baseApproximation.advance(next);
+            continue nextDoc;
           } else {
-            matches = dim.twoPhase.matches();
+            failedDim = dim;
           }
         }
+      }
 
-        if (matches == false) {
-          if (failedCollector != null) {
-            // More than one dim fails on this document, so
-            // it's neither a hit nor a near-miss; move to
-            // next doc:
-            docID = baseIterator.nextDoc();
-            continue nextDoc;
-          } else {
-            failedCollector = dim.sidewaysLeafCollector;
+      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      if (twoPhaseDims != null) {
+        if (failedDim == null) {
+          for (DocsAndCost dim : twoPhaseDims) {
+            assert dim.approximation.docID() == docID;
+            if (dim.twoPhase.matches() == false) {
+              if (failedDim != null) {
+                assert dim.approximation.docID() + 1 <= failedDim.approximation.docID();

Review Comment:
   Why is this assertion true? Shouldn't the two docID be the same and both equal to `docID` according to assertion in L228?



##########
lucene/facet/src/java/org/apache/lucene/facet/DrillSidewaysScorer.java:
##########
@@ -149,63 +162,147 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int maxDoc)
    */
   private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
       throws IOException {
-    // if (DEBUG) {
-    //  System.out.println("  doQueryFirstScoring");
-    // }
     setScorer(collector, ScoreCachingWrappingScorer.wrap(baseScorer));
 
-    int docID = baseScorer.docID();
+    List<DocsAndCost> allDims = Arrays.asList(dims);
+    CollectionUtil.timSort(allDims, Comparator.comparingLong(e -> e.approximation.cost()));
+
+    List<DocsAndCost> twoPhaseDims = null;
+    for (DocsAndCost dim : dims) {
+      if (dim.twoPhase != null) {
+        if (twoPhaseDims == null) {
+          twoPhaseDims = new ArrayList<>(dims.length);
+        }
+        twoPhaseDims.add(dim);
+      }
+    }
+    if (twoPhaseDims != null) {
+      CollectionUtil.timSort(
+          twoPhaseDims,
+          new Comparator<DocsAndCost>() {
+            @Override
+            public int compare(DocsAndCost o1, DocsAndCost o2) {
+              return Float.compare(o1.twoPhase.matchCost(), o2.twoPhase.matchCost());
+            }
+          });
+    }
+
+    int docID = baseApproximation.docID();
 
     nextDoc:
     while (docID != PostingsEnum.NO_MORE_DOCS) {
+      assert docID == baseApproximation.docID();
+
       if (acceptDocs != null && acceptDocs.get(docID) == false) {
-        docID = baseIterator.nextDoc();
+        docID = baseApproximation.nextDoc();
         continue;
       }
-      LeafCollector failedCollector = null;
-      for (DocsAndCost dim : dims) {
-        // TODO: should we sort this 2nd dimension of
-        // docsEnums from most frequent to least?
+
+      DocsAndCost failedDim = null;
+      for (DocsAndCost dim : allDims) {
+        final int dimDocID;
         if (dim.approximation.docID() < docID) {
-          dim.approximation.advance(docID);
+          dimDocID = dim.approximation.advance(docID);
+        } else {
+          dimDocID = dim.approximation.docID();
         }
-
-        boolean matches = false;
-        if (dim.approximation.docID() == docID) {
-          if (dim.twoPhase == null) {
-            matches = true;
+        if (dimDocID != docID) {
+          if (failedDim != null) {
+            int next = Math.min(dimDocID, failedDim.approximation.docID());
+            docID = baseApproximation.advance(next);
+            continue nextDoc;
           } else {
-            matches = dim.twoPhase.matches();
+            failedDim = dim;
           }
         }
+      }
 
-        if (matches == false) {
-          if (failedCollector != null) {
-            // More than one dim fails on this document, so
-            // it's neither a hit nor a near-miss; move to
-            // next doc:
-            docID = baseIterator.nextDoc();
-            continue nextDoc;
-          } else {
-            failedCollector = dim.sidewaysLeafCollector;
+      if (baseTwoPhase != null && baseTwoPhase.matches() == false) {
+        docID = baseApproximation.nextDoc();
+        continue;
+      }
+
+      if (twoPhaseDims != null) {
+        if (failedDim == null) {
+          for (DocsAndCost dim : twoPhaseDims) {
+            assert dim.approximation.docID() == docID;

Review Comment:
   I found it easier to understand if we assert `dim.approximation.docID() == baseApproximation.docID()` or simply add another line asserting `docID == baseApproximation.docID()`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org