You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2023/01/12 17:39:43 UTC
[lucene] 02/02: Speed up DocIdMerger on sorted indexes. (#12081)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git

commit 5f0d22073446d174a37f80a6d2265b36f4bf247c
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Jan 12 18:27:45 2023 +0100

    Speed up DocIdMerger on sorted indexes. (#12081)
    
    In the case when an index is sorted on a low-cardinality field, or the index
    sort order correlates with the order in which documents get ingested, we can
    optimize `SortedDocIDMerger` by doing a single comparison with the doc ID on
    the next sub. This checks covers at the same time whether the priority queue
    needs reordering and whether the current sub reached `NO_MORE_DOCS`.
---
 lucene/CHANGES.txt                                  |  2 ++
 .../java/org/apache/lucene/index/DocIDMerger.java   | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 97f1e833019..5dd8547bf58 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -184,6 +184,8 @@ Optimizations
 
 * GITHUB#12079: Faster merging of 1D points. (Adrien Grand)
 
+* GITHUB#12081: Small merging speedup on sorted indexes. (Adrien Grand)
+
 
 Other
 ---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
index f9accb710f1..dd1840ca850 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
@@ -137,6 +137,7 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
     private final List<T> subs;
     private T current;
     private final PriorityQueue<T> queue;
+    private int queueMinDocID;
 
     private SortedDocIDMerger(List<T> subs, int maxCount) throws IOException {
       if (maxCount <= 1) {
@@ -154,6 +155,14 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
       reset();
     }
 
+    private void setQueueMinDocID() {
+      if (queue.size() > 0) {
+        queueMinDocID = queue.top().mappedDocID;
+      } else {
+        queueMinDocID = DocIdSetIterator.NO_MORE_DOCS;
+      }
+    }
+
     @Override
     public void reset() throws IOException {
       // caller may not have fully consumed the queue:
@@ -171,23 +180,33 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
           queue.add(sub);
         } // else all docs in this sub were deleted; do not add it to the queue!
       }
+      setQueueMinDocID();
     }
 
     @Override
     public T next() throws IOException {
       int nextDoc = current.nextMappedDoc();
+      if (nextDoc < queueMinDocID) {
+        // This should be the common case when index sorting is either disabled, or enabled on a
+        // low-cardinality field, or enabled on a field that correlates with index order.
+        return current;
+      }
+
       if (nextDoc == NO_MORE_DOCS) {
         if (queue.size() == 0) {
           current = null;
         } else {
           current = queue.pop();
         }
-      } else if (queue.size() > 0 && nextDoc > queue.top().mappedDocID) {
+      } else if (queue.size() > 0) {
+        assert queueMinDocID == queue.top().mappedDocID;
+        assert nextDoc > queueMinDocID;
         T newCurrent = queue.top();
         queue.updateTop(current);
         current = newCurrent;
       }
 
+      setQueueMinDocID();
       return current;
     }
   }