You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/07/29 06:46:14 UTC
[lucene] branch main updated: LUCENE-10031: Speed up
SortedDocIdMerger on low-cardinality sort fields. (#221)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 0e6c314 LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)
0e6c314 is described below
commit 0e6c3146d7853d27037213dc58eddc16a0e05daa
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Jul 29 08:46:10 2021 +0200
LUCENE-10031: Speed up SortedDocIdMerger on low-cardinality sort fields. (#221)
When sorting by low-cardinality fields, the same sub remains current for long
sequences of doc IDs. This speeds up SortedDocIdMerger a bit by extracting
the sub that leads iteration.
---
lucene/CHANGES.txt | 3 +
.../java/org/apache/lucene/index/DocIDMerger.java | 99 ++++++++++------------
2 files changed, 48 insertions(+), 54 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3c8120e..3405d83 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -402,6 +402,9 @@ Optimizations
* LUCENE-10022: Rewrite empty DisjunctionMaxQuery to MatchNoDocsQuery.
(David Harsha via Julie Tibshirani)
+* LUCENE-10031: Slightly faster segment merging for sorted indices.
+ (Adrien Grand)
+
Bug Fixes
---------------------
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
index 22802ad..ef8b213 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocIDMerger.java
@@ -48,6 +48,24 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
* when done
*/
public abstract int nextDoc() throws IOException;
+
+ /**
+ * Like {@link #nextDoc()} but skips over unmapped docs and returns the next mapped doc ID, or
+ * {@link DocIdSetIterator#NO_MORE_DOCS} when exhausted. This method sets {@link #mappedDocID}
+ * as a side effect.
+ */
+ public final int nextMappedDoc() throws IOException {
+ while (true) {
+ int doc = nextDoc();
+ if (doc == NO_MORE_DOCS) {
+ return this.mappedDocID = NO_MORE_DOCS;
+ }
+ int mappedDoc = docMap.get(doc);
+ if (mappedDoc != -1) {
+ return this.mappedDocID = mappedDoc;
+ }
+ }
+ }
}
/** Construct this from the provided subs, specifying the maximum sub count */
@@ -101,36 +119,31 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
@Override
public T next() throws IOException {
- while (true) {
- int docID = current.nextDoc();
- if (docID == NO_MORE_DOCS) {
- if (nextIndex == subs.size()) {
- current = null;
- return null;
- }
- current = subs.get(nextIndex);
- nextIndex++;
- continue;
- }
-
- int mappedDocID = current.docMap.get(docID);
- if (mappedDocID != -1) {
- current.mappedDocID = mappedDocID;
- return current;
+ while (current.nextMappedDoc() == NO_MORE_DOCS) {
+ if (nextIndex == subs.size()) {
+ current = null;
+ return null;
}
+ current = subs.get(nextIndex);
+ nextIndex++;
}
+ return current;
}
}
private static class SortedDocIDMerger<T extends DocIDMerger.Sub> extends DocIDMerger<T> {
private final List<T> subs;
+ private T current;
private final PriorityQueue<T> queue;
private SortedDocIDMerger(List<T> subs, int maxCount) throws IOException {
+ if (maxCount <= 1) {
+ throw new IllegalArgumentException();
+ }
this.subs = subs;
queue =
- new PriorityQueue<T>(maxCount) {
+ new PriorityQueue<T>(maxCount - 1) {
@Override
protected boolean lessThan(Sub a, Sub b) {
assert a.mappedDocID != b.mappedDocID;
@@ -144,59 +157,37 @@ public abstract class DocIDMerger<T extends DocIDMerger.Sub> {
public void reset() throws IOException {
// caller may not have fully consumed the queue:
queue.clear();
+ current = null;
boolean first = true;
for (T sub : subs) {
if (first) {
// by setting mappedDocID = -1, this entry is guaranteed to be the top of the queue
// so the first call to next() will advance it
sub.mappedDocID = -1;
+ current = sub;
first = false;
- } else {
- int mappedDocID;
- while (true) {
- int docID = sub.nextDoc();
- if (docID == NO_MORE_DOCS) {
- mappedDocID = NO_MORE_DOCS;
- break;
- }
- mappedDocID = sub.docMap.get(docID);
- if (mappedDocID != -1) {
- break;
- }
- }
- if (mappedDocID == NO_MORE_DOCS) {
- // all docs in this sub were deleted; do not add it to the queue!
- continue;
- }
- sub.mappedDocID = mappedDocID;
- }
- queue.add(sub);
+ } else if (sub.nextMappedDoc() != NO_MORE_DOCS) {
+ queue.add(sub);
+ } // else all docs in this sub were deleted; do not add it to the queue!
}
}
@Override
public T next() throws IOException {
- T top = queue.top();
-
- while (true) {
- int docID = top.nextDoc();
- if (docID == NO_MORE_DOCS) {
- queue.pop();
- top = queue.top();
- break;
- }
- int mappedDocID = top.docMap.get(docID);
- if (mappedDocID == -1) {
- // doc was deleted
- continue;
+ int nextDoc = current.nextMappedDoc();
+ if (nextDoc == NO_MORE_DOCS) {
+ if (queue.size() == 0) {
+ current = null;
} else {
- top.mappedDocID = mappedDocID;
- top = queue.updateTop();
- break;
+ current = queue.pop();
}
+ } else if (queue.size() > 0 && nextDoc > queue.top().mappedDocID) {
+ T newCurrent = queue.top();
+ queue.updateTop(current);
+ current = newCurrent;
}
- return top;
+ return current;
}
}
}