You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2021/12/07 06:27:27 UTC
[lucene] branch branch_9x updated: LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
This is an automated email from the ASF dual-hosted git repository.
ivera pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 892e324 LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
892e324 is described below
commit 892e324d027d30699504baf59ac473135300df52
Author: gf2121 <52...@users.noreply.github.com>
AuthorDate: Tue Dec 7 14:26:03 2021 +0800
LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
---
lucene/CHANGES.txt | 2 +
.../apache/lucene/util/DocBaseBitSetIterator.java | 2 +-
.../org/apache/lucene/util/bkd/DocIdsWriter.java | 47 ++++++++++++++++++----
.../apache/lucene/util/bkd/TestDocIdsWriter.java | 15 +++++++
4 files changed, 58 insertions(+), 8 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b031e01..6812d2d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -52,6 +52,8 @@ Improvements
Optimizations
---------------------
+* LUCENE-10280: Optimize BKD leaves' doc IDs codec when they are continuous. (Guo Feng)
+
* LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)
diff --git a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
index ba05f94..841149f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
@@ -35,7 +35,7 @@ public class DocBaseBitSetIterator extends DocIdSetIterator {
throw new IllegalArgumentException("cost must be >= 0, got " + cost);
}
if ((docBase & 63) != 0) {
- throw new IllegalArgumentException("docBase need to be a multiple of 64");
+ throw new IllegalArgumentException("docBase need to be a multiple of 64, got " + docBase);
}
this.bits = bits;
this.length = bits.length() + docBase;
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
index def705b..4e27415 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
@@ -44,13 +44,22 @@ class DocIdsWriter {
}
}
- if (strictlySorted && (docIds[start + count - 1] - docIds[start] + 1) <= (count << 4)) {
- // Only trigger this optimization when max - min + 1 <= 16 * count in order to avoid expanding
- // too much storage.
- // A field with lower cardinality will have higher probability to trigger this optimization.
- out.writeByte((byte) -1);
- writeIdsAsBitSet(docIds, start, count, out);
- return;
+ int min2max = docIds[start + count - 1] - docIds[start] + 1;
+ if (strictlySorted) {
+ if (min2max == count) {
+ // continuous ids, typically happens when segment is sorted
+ out.writeByte((byte) -2);
+ out.writeVInt(docIds[start]);
+ return;
+ } else if (min2max <= (count << 4)) {
+ assert min2max > count : "min2max: " + min2max + ", count: " + count;
+ // Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid
+ // expanding too much storage.
+ // A field with lower cardinality will have higher probability to trigger this optimization.
+ out.writeByte((byte) -1);
+ writeIdsAsBitSet(docIds, start, count, out);
+ return;
+ }
}
if (sorted) {
out.writeByte((byte) 0);
@@ -139,6 +148,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
+ case -2:
+ readContinuousIds(in, count, docIDs);
+ break;
case -1:
readBitSet(in, count, docIDs);
break;
@@ -165,6 +177,13 @@ class DocIdsWriter {
return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
}
+ private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException {
+ int start = in.readVInt();
+ for (int i = 0; i < count; i++) {
+ docIDs[i] = start + i;
+ }
+ }
+
private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
DocIdSetIterator iterator = readBitSetIterator(in, count);
int docId, pos = 0;
@@ -215,6 +234,9 @@ class DocIdsWriter {
static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
final int bpv = in.readByte();
switch (bpv) {
+ case -2:
+ readContinuousIds(in, count, visitor);
+ break;
case -1:
readBitSet(in, count, visitor);
break;
@@ -274,4 +296,15 @@ class DocIdsWriter {
DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
visitor.visit(bitSetIterator);
}
+
+ private static void readContinuousIds(IndexInput in, int count, IntersectVisitor visitor)
+ throws IOException {
+ int start = in.readVInt();
+ int extra = start & 63;
+ int offset = start - extra;
+ int numBits = count + extra;
+ FixedBitSet bitSet = new FixedBitSet(numBits);
+ bitSet.set(extra, numBits);
+ visitor.visit(new DocBaseBitSetIterator(bitSet, count, offset));
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
index d2115c8..80d0d9f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
@@ -76,6 +76,21 @@ public class TestDocIdsWriter extends LuceneTestCase {
}
}
+ public void testContinuousIds() throws Exception {
+ int numIters = atLeast(100);
+ try (Directory dir = newDirectory()) {
+ for (int iter = 0; iter < numIters; ++iter) {
+ int size = 1 + random().nextInt(5000);
+ int[] docIDs = new int[size];
+ int start = random().nextInt(1000000);
+ for (int i = 0; i < docIDs.length; i++) {
+ docIDs[i] = start + i;
+ }
+ test(dir, docIDs);
+ }
+ }
+ }
+
private void test(Directory dir, int[] ints) throws Exception {
final long len;
try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {