You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2021/12/07 06:27:27 UTC

[lucene] branch branch_9x updated: LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)

This is an automated email from the ASF dual-hosted git repository.

ivera pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 892e324  LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
892e324 is described below

commit 892e324d027d30699504baf59ac473135300df52
Author: gf2121 <52...@users.noreply.github.com>
AuthorDate: Tue Dec 7 14:26:03 2021 +0800

    LUCENE-10280: Store BKD blocks with continuous ids more efficiently (#510)
---
 lucene/CHANGES.txt                                 |  2 +
 .../apache/lucene/util/DocBaseBitSetIterator.java  |  2 +-
 .../org/apache/lucene/util/bkd/DocIdsWriter.java   | 47 ++++++++++++++++++----
 .../apache/lucene/util/bkd/TestDocIdsWriter.java   | 15 +++++++
 4 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b031e01..6812d2d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -52,6 +52,8 @@ Improvements
 Optimizations
 ---------------------
 
+* LUCENE-10280: Optimize BKD leaves' doc IDs codec when they are continuous. (Guo Feng)
+
 * LUCENE-10233: Store BKD leaves' doc IDs as bitset in some cases (typically for low cardinality fields
  or sorted indices) to speed up addAll. (Guo Feng, Adrien Grand)
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
index ba05f94..841149f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java
@@ -35,7 +35,7 @@ public class DocBaseBitSetIterator extends DocIdSetIterator {
       throw new IllegalArgumentException("cost must be >= 0, got " + cost);
     }
     if ((docBase & 63) != 0) {
-      throw new IllegalArgumentException("docBase need to be a multiple of 64");
+      throw new IllegalArgumentException("docBase need to be a multiple of 64, got " + docBase);
     }
     this.bits = bits;
     this.length = bits.length() + docBase;
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
index def705b..4e27415 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
@@ -44,13 +44,22 @@ class DocIdsWriter {
       }
     }
 
-    if (strictlySorted && (docIds[start + count - 1] - docIds[start] + 1) <= (count << 4)) {
-      // Only trigger this optimization when max - min + 1 <= 16 * count in order to avoid expanding
-      // too much storage.
-      // A field with lower cardinality will have higher probability to trigger this optimization.
-      out.writeByte((byte) -1);
-      writeIdsAsBitSet(docIds, start, count, out);
-      return;
+    int min2max = docIds[start + count - 1] - docIds[start] + 1;
+    if (strictlySorted) {
+      if (min2max == count) {
+        // continuous ids, typically happens when segment is sorted
+        out.writeByte((byte) -2);
+        out.writeVInt(docIds[start]);
+        return;
+      } else if (min2max <= (count << 4)) {
+        assert min2max > count : "min2max: " + min2max + ", count: " + count;
+        // Only trigger bitset optimization when max - min + 1 <= 16 * count in order to avoid
+        // expanding too much storage.
+        // A field with lower cardinality will have higher probability to trigger this optimization.
+        out.writeByte((byte) -1);
+        writeIdsAsBitSet(docIds, start, count, out);
+        return;
+      }
     }
     if (sorted) {
       out.writeByte((byte) 0);
@@ -139,6 +148,9 @@ class DocIdsWriter {
   static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
     final int bpv = in.readByte();
     switch (bpv) {
+      case -2:
+        readContinuousIds(in, count, docIDs);
+        break;
       case -1:
         readBitSet(in, count, docIDs);
         break;
@@ -165,6 +177,13 @@ class DocIdsWriter {
     return new DocBaseBitSetIterator(bitSet, count, offsetWords << 6);
   }
 
+  private static void readContinuousIds(IndexInput in, int count, int[] docIDs) throws IOException {
+    int start = in.readVInt();
+    for (int i = 0; i < count; i++) {
+      docIDs[i] = start + i;
+    }
+  }
+
   private static void readBitSet(IndexInput in, int count, int[] docIDs) throws IOException {
     DocIdSetIterator iterator = readBitSetIterator(in, count);
     int docId, pos = 0;
@@ -215,6 +234,9 @@ class DocIdsWriter {
   static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
     final int bpv = in.readByte();
     switch (bpv) {
+      case -2:
+        readContinuousIds(in, count, visitor);
+        break;
       case -1:
         readBitSet(in, count, visitor);
         break;
@@ -274,4 +296,15 @@ class DocIdsWriter {
     DocIdSetIterator bitSetIterator = readBitSetIterator(in, count);
     visitor.visit(bitSetIterator);
   }
+
+  private static void readContinuousIds(IndexInput in, int count, IntersectVisitor visitor)
+      throws IOException {
+    int start = in.readVInt();
+    int extra = start & 63;
+    int offset = start - extra;
+    int numBits = count + extra;
+    FixedBitSet bitSet = new FixedBitSet(numBits);
+    bitSet.set(extra, numBits);
+    visitor.visit(new DocBaseBitSetIterator(bitSet, count, offset));
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
index d2115c8..80d0d9f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
@@ -76,6 +76,21 @@ public class TestDocIdsWriter extends LuceneTestCase {
     }
   }
 
+  public void testContinuousIds() throws Exception {
+    int numIters = atLeast(100);
+    try (Directory dir = newDirectory()) {
+      for (int iter = 0; iter < numIters; ++iter) {
+        int size = 1 + random().nextInt(5000);
+        int[] docIDs = new int[size];
+        int start = random().nextInt(1000000);
+        for (int i = 0; i < docIDs.length; i++) {
+          docIDs[i] = start + i;
+        }
+        test(dir, docIDs);
+      }
+    }
+  }
+
   private void test(Directory dir, int[] ints) throws Exception {
     final long len;
     try (IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {