You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/07/29 06:46:14 UTC
[lucene] branch main updated: LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e6c314  LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)
0e6c314 is described below

commit 0e6c3146d7853d27037213dc58eddc16a0e05daa
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Jul 29 08:46:10 2021 +0200

    LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)
    
    When sorting by low-cardinality fields, the same sub remains current for long
    sequences of doc IDs. This speeds up SortedDocIdMerger a bit by extracting
    the sub that leads iteration.
---
 lucene/CHANGES.txt                                 |  3 +
 .../java/org/apache/lucene/index/DocIDMerger.java  | 99 ++++++++++------------
 2 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3c8120e..3405d83 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -402,6 +402,9 @@ Optimizations
 * LUCENE-10022: Rewrite empty DisjunctionMaxQuery to MatchNoDocsQuery.
   (David Harsha via Julie Tibshirani)
 
+* LUCENE-10031: Slightly faster segment merging for sorted indices.
+  (Adrien Grand)
+
 Bug Fixes
 ---------------------
 * LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
index 22802ad..ef8b213 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
@@ -48,6 +48,24 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
      * when done
      */
     public abstract int nextDoc() throws IOException;
+
+    /**
+     * Like {@link #nextDoc()} but skips over unmapped docs and returns the next mapped doc ID, or
+     * {@link DocIdSetIterator#NO_MORE_DOCS} when exhausted. This method sets {@link #mappedDocID}
+     * as a side effect.
+     */
+    public final int nextMappedDoc() throws IOException {
+      while (true) {
+        int doc = nextDoc();
+        if (doc == NO_MORE_DOCS) {
+          return this.mappedDocID = NO_MORE_DOCS;
+        }
+        int mappedDoc = docMap.get(doc);
+        if (mappedDoc != -1) {
+          return this.mappedDocID = mappedDoc;
+        }
+      }
+    }
   }
 
   /** Construct this from the provided subs, specifying the maximum sub count */
@@ -101,36 +119,31 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
 
     @Override
     public T next() throws IOException {
-      while (true) {
-        int docID = current.nextDoc();
-        if (docID == NO_MORE_DOCS) {
-          if (nextIndex == subs.size()) {
-            current = null;
-            return null;
-          }
-          current = subs.get(nextIndex);
-          nextIndex++;
-          continue;
-        }
-
-        int mappedDocID = current.docMap.get(docID);
-        if (mappedDocID != -1) {
-          current.mappedDocID = mappedDocID;
-          return current;
+      while (current.nextMappedDoc() == NO_MORE_DOCS) {
+        if (nextIndex == subs.size()) {
+          current = null;
+          return null;
         }
+        current = subs.get(nextIndex);
+        nextIndex++;
       }
+      return current;
     }
   }
 
   private static class SortedDocIDMerger<T extends DocIDMerger.Sub> extends DocIDMerger<T> {
 
     private final List<T> subs;
+    private T current;
     private final PriorityQueue<T> queue;
 
     private SortedDocIDMerger(List<T> subs, int maxCount) throws IOException {
+      if (maxCount <= 1) {
+        throw new IllegalArgumentException();
+      }
       this.subs = subs;
       queue =
-          new PriorityQueue<T>(maxCount) {
+          new PriorityQueue<T>(maxCount - 1) {
             @Override
             protected boolean lessThan(Sub a, Sub b) {
               assert a.mappedDocID != b.mappedDocID;
@@ -144,59 +157,37 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
     public void reset() throws IOException {
       // caller may not have fully consumed the queue:
       queue.clear();
+      current = null;
       boolean first = true;
       for (T sub : subs) {
         if (first) {
           // by setting mappedDocID = -1, this entry is guaranteed to be the top of the queue
           // so the first call to next() will advance it
           sub.mappedDocID = -1;
+          current = sub;
           first = false;
-        } else {
-          int mappedDocID;
-          while (true) {
-            int docID = sub.nextDoc();
-            if (docID == NO_MORE_DOCS) {
-              mappedDocID = NO_MORE_DOCS;
-              break;
-            }
-            mappedDocID = sub.docMap.get(docID);
-            if (mappedDocID != -1) {
-              break;
-            }
-          }
-          if (mappedDocID == NO_MORE_DOCS) {
-            // all docs in this sub were deleted; do not add it to the queue!
-            continue;
-          }
-          sub.mappedDocID = mappedDocID;
-        }
-        queue.add(sub);
+        } else if (sub.nextMappedDoc() != NO_MORE_DOCS) {
+          queue.add(sub);
+        } // else all docs in this sub were deleted; do not add it to the queue!
       }
     }
 
     @Override
     public T next() throws IOException {
-      T top = queue.top();
-
-      while (true) {
-        int docID = top.nextDoc();
-        if (docID == NO_MORE_DOCS) {
-          queue.pop();
-          top = queue.top();
-          break;
-        }
-        int mappedDocID = top.docMap.get(docID);
-        if (mappedDocID == -1) {
-          // doc was deleted
-          continue;
+      int nextDoc = current.nextMappedDoc();
+      if (nextDoc == NO_MORE_DOCS) {
+        if (queue.size() == 0) {
+          current = null;
         } else {
-          top.mappedDocID = mappedDocID;
-          top = queue.updateTop();
-          break;
+          current = queue.pop();
         }
+      } else if (queue.size() > 0 && nextDoc > queue.top().mappedDocID) {
+        T newCurrent = queue.top();
+        queue.updateTop(current);
+        current = newCurrent;
       }
 
-      return top;
+      return current;
     }
   }
 }