You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2020/09/17 17:11:48 UTC
[lucene-solr] 01/02: Further tune Lucene87StoredFieldsFormat for
small documents. (#1888)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 54f1ddf39741231c85b6c8b733b9208468f0a703
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Sep 17 18:30:57 2020 +0200
Further tune Lucene87StoredFieldsFormat for small documents. (#1888)
The increase of the maximum number of chunks per doc done in previous
issues was mostly random. I'd like to provide users with a similar
trade-off to what the old versions of BEST_SPEED and BEST_COMPRESSION
used to do. So since BEST_SPEED used to compress at most 128 docs at
once, I think we should roughly make it 128*10 now since there are 10
sub blocks. I made it 1024 to account for the fact that there is a preset
dict as well that needs decompressing. And similarly BEST_COMPRESSION used
to allow 4x more docs than BEST_SPEED, so I made it 4096.
With such larger numbers of docs per chunk, the decoding of metadata
became a bottleneck for stored field access so I made it a bit faster by
doing bulk decoding of the packed longs.
---
.../compressing/CompressingStoredFieldsReader.java | 36 ++++++++++++----------
.../lucene87/Lucene87StoredFieldsFormat.java | 4 +--
2 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
index 2ced3de..0903100 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@@ -69,7 +69,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -402,8 +402,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
// whether the block has been sliced, this happens for large documents
private boolean sliced;
- private int[] offsets = IntsRef.EMPTY_INTS;
- private int[] numStoredFields = IntsRef.EMPTY_INTS;
+ private long[] offsets = LongsRef.EMPTY_LONGS;
+ private long[] numStoredFields = LongsRef.EMPTY_LONGS;
// the start pointer at which you can read the compressed documents
private long startPointer;
@@ -472,9 +472,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
} else if (bitsPerStoredFields > 31) {
throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields, fieldsStream);
} else {
- final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
- for (int i = 0; i < chunkDocs; ++i) {
- numStoredFields[i] = (int) it.next();
+ final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1024);
+ for (int i = 0; i < chunkDocs; ) {
+ final LongsRef next = it.next(Integer.MAX_VALUE);
+ System.arraycopy(next.longs, next.offset, numStoredFields, i, next.length);
+ i += next.length;
}
}
@@ -489,9 +491,11 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
} else if (bitsPerLength > 31) {
throw new CorruptIndexException("bitsPerLength=" + bitsPerLength, fieldsStream);
} else {
- final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
- for (int i = 0; i < chunkDocs; ++i) {
- offsets[i + 1] = (int) it.next();
+ final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1024);
+ for (int i = 0; i < chunkDocs; ) {
+ final LongsRef next = it.next(Integer.MAX_VALUE);
+ System.arraycopy(next.longs, next.offset, offsets, i + 1, next.length);
+ i += next.length;
}
for (int i = 0; i < chunkDocs; ++i) {
offsets[i + 1] += offsets[i];
@@ -500,8 +504,8 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
// Additional validation: only the empty document has a serialized length of 0
for (int i = 0; i < chunkDocs; ++i) {
- final int len = offsets[i + 1] - offsets[i];
- final int storedFields = numStoredFields[i];
+ final long len = offsets[i + 1] - offsets[i];
+ final long storedFields = numStoredFields[i];
if ((len == 0) != (storedFields == 0)) {
throw new CorruptIndexException("length=" + len + ", numStoredFields=" + storedFields, fieldsStream);
}
@@ -512,7 +516,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
startPointer = fieldsStream.getFilePointer();
if (merging) {
- final int totalLength = offsets[chunkDocs];
+ final int totalLength = Math.toIntExact(offsets[chunkDocs]);
// decompress eagerly
if (sliced) {
bytes.offset = bytes.length = 0;
@@ -543,10 +547,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
}
final int index = docID - docBase;
- final int offset = offsets[index];
- final int length = offsets[index+1] - offset;
- final int totalLength = offsets[chunkDocs];
- final int numStoredFields = this.numStoredFields[index];
+ final int offset = Math.toIntExact(offsets[index]);
+ final int length = Math.toIntExact(offsets[index+1]) - offset;
+ final int totalLength = Math.toIntExact(offsets[chunkDocs]);
+ final int numStoredFields = Math.toIntExact(this.numStoredFields[index]);
final BytesRef bytes;
if (merging) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
index 0e9c9f5..1432df8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87StoredFieldsFormat.java
@@ -144,9 +144,9 @@ public class Lucene87StoredFieldsFormat extends StoredFieldsFormat {
StoredFieldsFormat impl(Mode mode) {
switch (mode) {
case BEST_SPEED:
- return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 512, 10);
+ return new CompressingStoredFieldsFormat("Lucene87StoredFieldsFastData", BEST_SPEED_MODE, BEST_SPEED_BLOCK_LENGTH, 1024, 10);
case BEST_COMPRESSION:
- return new CompressingStoredFieldsFormat("Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, 512, 10);
+ return new CompressingStoredFieldsFormat("Lucene87StoredFieldsHighData", BEST_COMPRESSION_MODE, BEST_COMPRESSION_BLOCK_LENGTH, 4096, 10);
default: throw new AssertionError();
}
}