Posted to commits@lucene.apache.org by jp...@apache.org on 2020/09/17 17:11:48 UTC

[lucene-solr] 01/02: Further tune Lucene87StoredFieldsFormat for small documents. (#1888)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 54f1ddf39741231c85b6c8b733b9208468f0a703
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Sep 17 18:30:57 2020 +0200

    Further tune Lucene87StoredFieldsFormat for small documents. (#1888)
    
    The increase of the maximum number of docs per chunk done in previous
    issues was mostly arbitrary. I'd like to give users a trade-off similar
    to what the old versions of BEST_SPEED and BEST_COMPRESSION used to
    provide. Since BEST_SPEED used to compress at most 128 docs at once, it
    should roughly become 128*10 now that each chunk has 10 sub blocks. I
    made it 1024 to account for the fact that there is a preset dict as well
    that needs decompressing. Similarly, BEST_COMPRESSION used to allow 4x
    more docs than BEST_SPEED, so I made it 4096.
    
    With these larger numbers of docs per chunk, decoding the chunk metadata
    (per-document lengths and field counts) became a bottleneck for
    stored-field access, so I made it a bit faster by bulk-decoding the
    packed longs.
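
A rough sketch of the tuning arithmetic above, in Java; the class and
variable names below are illustrative only and are not identifiers from the
Lucene sources:

    // Illustrative only; these names are not identifiers from the Lucene sources.
    public class ChunkDocLimitSketch {
      public static void main(String[] args) {
        int oldBestSpeedDocsPerChunk = 128;  // BEST_SPEED used to compress at most 128 docs at once
        int subBlocksPerChunk = 10;          // each chunk is now split into 10 sub blocks
        System.out.println(oldBestSpeedDocsPerChunk * subBlocksPerChunk); // 1280
        // Rounded down to 1024 because the shared preset dictionary also has
        // to be decompressed before any sub block can be read.
        int bestSpeedMaxDocsPerChunk = 1024;
        // BEST_COMPRESSION historically allowed 4x more docs than BEST_SPEED.
        int bestCompressionMaxDocsPerChunk = 4 * bestSpeedMaxDocsPerChunk;
        System.out.println(bestCompressionMaxDocsPerChunk); // 4096
      }
    }

These are the 1024 and 4096 values passed to CompressingStoredFieldsFormat in
the Lucene87StoredFieldsFormat hunk below; a standalone sketch of the bulk
metadata decoding mentioned in the last paragraph follows the diff.
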
---
 .../compressing/CompressingStoredFieldsReader.java | 36 ++++++++++++----------
 .../lucene87/Lucene87StoredFieldsFormat.java       |  4 +--
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
index 2ced3de..0903100 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@@ -69,7 +69,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BitUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LongsRef;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -402,8 +402,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
     // whether the block has been sliced, this happens for large documents
     private boolean sliced;
 
-    private int[] offsets = IntsRef.EMPTY_INTS;
-    private int[] numStoredFields = IntsRef.EMPTY_INTS;
+    private long[] offsets = LongsRef.EMPTY_LONGS;
+    private long[] numStoredFields = LongsRef.EMPTY_LONGS;
 
     // the start pointer at which you can read the compressed documents
     private long startPointer;
@@ -472,9 +472,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
         } else if (bitsPerStoredFields > 31) {
           throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields, fieldsStream);
         } else {
-          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
-          for (int i = 0; i < chunkDocs; ++i) {
-            numStoredFields[i] = (int) it.next();
+          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1024);
+          for (int i = 0; i < chunkDocs; ) {
+            final LongsRef next = it.next(Integer.MAX_VALUE);
+            System.arraycopy(next.longs, next.offset, numStoredFields, i, next.length);
+            i += next.length;
           }
         }
 
@@ -489,9 +491,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
         } else if (bitsPerLength > 31) {
           throw new CorruptIndexException("bitsPerLength=" + bitsPerLength, fieldsStream);
         } else {
-          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
-          for (int i = 0; i < chunkDocs; ++i) {
-            offsets[i + 1] = (int) it.next();
+          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1024);
+          for (int i = 0; i < chunkDocs; ) {
+            final LongsRef next = it.next(Integer.MAX_VALUE);
+            System.arraycopy(next.longs, next.offset, offsets, i + 1, next.length);
+            i += next.length;
           }
           for (int i = 0; i < chunkDocs; ++i) {
             offsets[i + 1] += offsets[i];
@@ -500,8 +504,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
 
         // Additional validation: only the empty document has a serialized length of 0
         for (int i = 0; i < chunkDocs; ++i) {
-          final int len = offsets[i + 1] - offsets[i];
-          final int storedFields = numStoredFields[i];
+          final long len = offsets[i + 1] - offsets[i];
+          final long storedFields = numStoredFields[i];
           if ((len == 0) != (storedFields == 0)) {
             throw new CorruptIndexException("length=" + len + ", numStoredFields=" + storedFields, fieldsStream);
           }
@@ -512,7 +516,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
       startPointer = fieldsStream.getFilePointer();
 
       if (merging) {
-        final int totalLength = offsets[chunkDocs];
+        final int totalLength = Math.toIntExact(offsets[chunkDocs]);
         // decompress eagerly
         if (sliced) {
           bytes.offset = bytes.length = 0;
@@ -543,10 +547,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
       }
 
       final int index = docID - docBase;
-      final int offset = offsets[index];
-      final int length = offsets[index+1] - offset;
-      final int totalLength = offsets[chunkDocs];
-      final int numStoredFields = this.numStoredFields[index];
+      final int offset = Math.toIntExact(offsets[index]);
+      final int length = Math.toIntExact(offsets[index+1]) - offset;
+      final int totalLength = Math.toIntExact(offsets[chunkDocs]);
+      final int numStoredFields = Math.toIntExact(this.numStoredFields[index]);
 
       final BytesRef bytes;
       if (merging) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
index 0e9c9f5..1432df8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
@@ -144,9 +144,9 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
   StoredFieldsFormat impl(Mode mode) {
     switch (mode) {
       case BEST_SPEED:
-        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 512, 10);
+        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 1024, 10);
       case BEST_COMPRESSION:
-        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, 512, 10);
+        return new CompressingStoredFieldsFormat("Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, 4096, 10);
       default: throw new AssertionError();
     }
   }
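
The two CompressingStoredFieldsReader hunks above replace per-value it.next()
calls with bulk it.next(count) calls that fill a LongsRef. Below is a minimal,
self-contained sketch of that pattern, assuming the Lucene 8.x PackedInts,
LongsRef and ByteBuffersDataOutput APIs; the class name and the toy values are
made up and are not part of the commit:

    import org.apache.lucene.store.ByteBuffersDataOutput;
    import org.apache.lucene.util.LongsRef;
    import org.apache.lucene.util.packed.PackedInts;

    public class BulkPackedDecodeSketch {
      public static void main(String[] args) throws Exception {
        final int valueCount = 1000;  // e.g. one value per doc in a chunk
        final int bitsPerValue = 7;   // enough for the toy values below

        // Write valueCount packed values to an in-memory DataOutput.
        ByteBuffersDataOutput out = new ByteBuffersDataOutput();
        PackedInts.Writer writer = PackedInts.getWriterNoHeader(
            out, PackedInts.Format.PACKED, valueCount, bitsPerValue, 1024);
        for (int i = 0; i < valueCount; ++i) {
          writer.add(i % 128);
        }
        writer.finish();

        // Read them back in bulk: next(count) decodes a batch of values and
        // returns a view into the iterator's internal buffer, which is then
        // copied out with System.arraycopy instead of calling next() once
        // per value.
        long[] values = new long[valueCount];
        PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(
            out.toDataInput(), PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT,
            valueCount, bitsPerValue, 1024);
        for (int i = 0; i < valueCount; ) {
          LongsRef next = it.next(Integer.MAX_VALUE);
          System.arraycopy(next.longs, next.offset, values, i, next.length);
          i += next.length;
        }
      }
    }

The larger memory hint in the getReaderIteratorNoHeader calls of the diff
(1024 instead of 1) lets the iterator read ahead and decode more values per
batch, which is what makes the bulk next(count) path worthwhile.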