You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dn...@apache.org on 2021/05/16 20:47:42 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9935: Enable bulk
merge for stored fields with index sort
This is an automated email from the ASF dual-hosted git repository.
dnhatn pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new efdc43f LUCENE-9935: Enable bulk merge for stored fields with index sort
efdc43f is described below
commit efdc43fee18e33c664a22df97ef6cd40912af20d
Author: Nhat Nguyen <nh...@elastic.co>
AuthorDate: Sun May 16 16:47:21 2021 -0400
LUCENE-9935: Enable bulk merge for stored fields with index sort
This commit enables bulk-merges (i.e., raw chunk copying) for stored
fields when index sort is enabled.
---
lucene/CHANGES.txt | 1 +
.../compressing/CompressingStoredFieldsReader.java | 18 +-
.../compressing/CompressingStoredFieldsWriter.java | 354 +++++++++++----------
.../index/BaseStoredFieldsFormatTestCase.java | 129 ++++++++
4 files changed, 332 insertions(+), 170 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 35a794b..d85f088 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -54,6 +54,7 @@ Improvements
* LUCENE-9877: Reduce index size by increasing allowable exceptions in PForUtil from
3 to 7. (Greg Miller)
+* LUCENE-9935: Enable bulk merge for stored fields with index sort. (Robert Muir, Adrien Grand, Nhat Nguyen)
Optimizations
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
index 03aaa39..e39d0b6 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
@@ -37,7 +37,7 @@ import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
-import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_NUMCHUNKS;
+import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_NUM_CHUNKS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_META;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_OFFHEAP_INDEX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
@@ -189,7 +189,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
this.maxPointer = maxPointer;
this.indexReader = indexReader;
- if (version >= VERSION_NUMCHUNKS) {
+ if (version >= VERSION_NUM_CHUNKS) {
numChunks = metaIn.readVLong();
numDirtyChunks = metaIn.readVLong();
numDirtyDocs = metaIn.readVLong();
@@ -480,7 +480,7 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
private void doReset(int docID) throws IOException {
docBase = fieldsStream.readVInt();
final int token = fieldsStream.readVInt();
- chunkDocs = token >>> 1;
+ chunkDocs = version >= VERSION_NUM_CHUNKS ? token >>> 2 : token >>> 1;
if (contains(docID) == false
|| docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: docID=" + docID
@@ -659,6 +659,18 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
return state.document(docID);
}
+ /** Checks if a given docID was loaded in the current block state. */
+ boolean isLoaded(int docID) {
+ if (merging == false) {
+ throw new IllegalStateException("isLoaded should only ever get called on a merge instance");
+ }
+ if (version != VERSION_CURRENT) {
+ throw new IllegalStateException(
+ "isLoaded should only ever get called when the reader is on the current version");
+ }
+ return state.contains(docID);
+ }
+
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor)
throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
index fbed04d..7520b92 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
@@ -41,7 +41,6 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
-import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
@@ -77,9 +76,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
static final int VERSION_OFFHEAP_INDEX = 2;
/** Version where all metadata were moved to the meta file. */
static final int VERSION_META = 3;
- /** Version where numChunks is explicitly recorded in meta file */
- static final int VERSION_NUMCHUNKS = 4;
- static final int VERSION_CURRENT = VERSION_NUMCHUNKS;
+ /** Version where numChunks is explicitly recorded in meta file and a dirty chunk bit is recorded in each chunk */
+ static final int VERSION_NUM_CHUNKS = 4;
+ static final int VERSION_CURRENT = VERSION_NUM_CHUNKS;
static final int META_VERSION_START = 0;
private final String segment;
@@ -170,7 +169,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
endOffsets[numBufferedDocs] = Math.toIntExact(bufferedDocs.size());
++numBufferedDocs;
if (triggerFlush()) {
- flush();
+ flush(false);
}
}
@@ -205,12 +204,14 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
}
}
- private void writeHeader(int docBase, int numBufferedDocs, int[] numStoredFields, int[] lengths, boolean sliced) throws IOException {
+ private void writeHeader(int docBase, int numBufferedDocs, int[] numStoredFields,
+ int[] lengths, boolean sliced, boolean dirtyChunk) throws IOException {
final int slicedBit = sliced ? 1 : 0;
-
+ final int dirtyBit = dirtyChunk ? 2 : 0;
+
// save docBase and numBufferedDocs
fieldsStream.writeVInt(docBase);
- fieldsStream.writeVInt((numBufferedDocs) << 1 | slicedBit);
+ fieldsStream.writeVInt((numBufferedDocs << 2) | dirtyBit | slicedBit);
// save numStoredFields
saveInts(numStoredFields, numBufferedDocs, fieldsStream);
@@ -224,8 +225,13 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
numBufferedDocs >= maxDocsPerChunk;
}
- private void flush() throws IOException {
+ private void flush(boolean force) throws IOException {
+ assert triggerFlush() != force;
numChunks++;
+ if (force) {
+ numDirtyChunks++; // incomplete: we had to force this flush
+ numDirtyDocs += numBufferedDocs;
+ }
indexWriter.writeIndex(numBufferedDocs, fieldsStream.getFilePointer());
// transform end offsets into lengths
@@ -235,7 +241,8 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
assert lengths[i] >= 0;
}
final boolean sliced = bufferedDocs.size() >= 2 * chunkSize;
- writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced);
+ final boolean dirtyChunk = force;
+ writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced, dirtyChunk);
// compress stored fields to fieldsStream
//
@@ -472,9 +479,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
@Override
public void finish(FieldInfos fis, int numDocs) throws IOException {
if (numBufferedDocs > 0) {
- numDirtyChunks++; // incomplete: we had to force this flush
- numDirtyDocs += numBufferedDocs;
- flush();
+ flush(true);
} else {
assert bufferedDocs.size() == 0;
}
@@ -503,164 +508,143 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
BULK_MERGE_ENABLED = v;
}
- @Override
- public int merge(MergeState mergeState) throws IOException {
- int docCount = 0;
- int numReaders = mergeState.maxDocs.length;
-
- MatchingReaders matching = new MatchingReaders(mergeState);
- if (mergeState.needsIndexSort) {
- /**
- * If all readers are compressed and they have the same fieldinfos then we can merge the serialized document
- * directly.
- */
- List<CompressingStoredFieldsMergeSub> subs = new ArrayList<>();
- for(int i=0;i<mergeState.storedFieldsReaders.length;i++) {
- if (matching.matchingReaders[i] &&
- mergeState.storedFieldsReaders[i] instanceof CompressingStoredFieldsReader) {
- CompressingStoredFieldsReader storedFieldsReader = (CompressingStoredFieldsReader) mergeState.storedFieldsReaders[i];
- storedFieldsReader.checkIntegrity();
- subs.add(new CompressingStoredFieldsMergeSub(storedFieldsReader, mergeState.docMaps[i], mergeState.maxDocs[i]));
- } else {
- return super.merge(mergeState);
- }
- }
+ private void copyOneDoc(CompressingStoredFieldsReader reader, int docID)
+ throws IOException {
+ assert reader.getVersion() == VERSION_CURRENT;
+ SerializedDocument doc = reader.document(docID);
+ startDocument();
+ bufferedDocs.copyBytes(doc.in, doc.length);
+ numStoredFieldsInDoc = doc.numStoredFields;
+ finishDocument();
+ }
- final DocIDMerger<CompressingStoredFieldsMergeSub> docIDMerger =
- DocIDMerger.of(subs, true);
- while (true) {
- CompressingStoredFieldsMergeSub sub = docIDMerger.next();
- if (sub == null) {
- break;
- }
- assert sub.mappedDocID == docCount;
- SerializedDocument doc = sub.reader.document(sub.docID);
- startDocument();
- bufferedDocs.copyBytes(doc.in, doc.length);
- numStoredFieldsInDoc = doc.numStoredFields;
- finishDocument();
- ++docCount;
- }
- finish(mergeState.mergeFieldInfos, docCount);
- return docCount;
+ private void copyChunks(
+ final MergeState mergeState,
+ final CompressingStoredFieldsMergeSub sub,
+ final int fromDocID,
+ final int toDocID)
+ throws IOException {
+ final CompressingStoredFieldsReader reader =
+ (CompressingStoredFieldsReader) mergeState.storedFieldsReaders[sub.readerIndex];
+ assert reader.getVersion() == VERSION_CURRENT;
+ assert reader.getChunkSize() == chunkSize;
+ assert reader.getCompressionMode() == compressionMode;
+ assert !tooDirty(reader);
+ assert mergeState.liveDocs[sub.readerIndex] == null;
+
+ int docID = fromDocID;
+ final FieldsIndex index = reader.getIndexReader();
+
+ // copy docs that belong to the previous chunk
+ while (docID < toDocID && reader.isLoaded(docID)) {
+ copyOneDoc(reader, docID++);
}
-
- for (int readerIndex=0;readerIndex<numReaders;readerIndex++) {
- MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
- CompressingStoredFieldsReader matchingFieldsReader = null;
- if (matching.matchingReaders[readerIndex]) {
- final StoredFieldsReader fieldsReader = mergeState.storedFieldsReaders[readerIndex];
- // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
- if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
- matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
- }
+ if (docID >= toDocID) {
+ return;
+ }
+ // copy chunks
+ long fromPointer = index.getStartPointer(docID);
+ final long toPointer =
+ toDocID == sub.maxDoc ? reader.getMaxPointer() : index.getStartPointer(toDocID);
+ if (fromPointer < toPointer) {
+ if (numBufferedDocs > 0) {
+ flush(true);
}
-
- final int maxDoc = mergeState.maxDocs[readerIndex];
- final Bits liveDocs = mergeState.liveDocs[readerIndex];
-
- // if its some other format, or an older version of this format, or safety switch:
- if (matchingFieldsReader == null || matchingFieldsReader.getVersion() != VERSION_CURRENT || BULK_MERGE_ENABLED == false) {
- // naive merge...
- StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[readerIndex];
- if (storedFieldsReader != null) {
- storedFieldsReader.checkIntegrity();
+ final IndexInput rawDocs = reader.getFieldsStream();
+ rawDocs.seek(fromPointer);
+ do {
+ final int base = rawDocs.readVInt();
+ final int code = rawDocs.readVInt();
+ final int bufferedDocs = code >>> 2;
+ if (base != docID) {
+ throw new CorruptIndexException(
+ "invalid state: base=" + base + ", docID=" + docID, rawDocs);
}
- for (int docID = 0; docID < maxDoc; docID++) {
- if (liveDocs != null && liveDocs.get(docID) == false) {
- continue;
- }
- startDocument();
- storedFieldsReader.visitDocument(docID, visitor);
- finishDocument();
- ++docCount;
+ // write a new index entry and new header for this chunk.
+ indexWriter.writeIndex(bufferedDocs, fieldsStream.getFilePointer());
+ fieldsStream.writeVInt(docBase); // rebase
+ fieldsStream.writeVInt(code);
+ docID += bufferedDocs;
+ docBase += bufferedDocs;
+ if (docID > toDocID) {
+ throw new CorruptIndexException(
+ "invalid state: base=" + base + ", count=" + bufferedDocs + ", toDocID=" + toDocID,
+ rawDocs);
}
- } else if (matchingFieldsReader.getCompressionMode() == compressionMode &&
- matchingFieldsReader.getChunkSize() == chunkSize &&
- matchingFieldsReader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT &&
- liveDocs == null &&
- !tooDirty(matchingFieldsReader)) {
- // optimized merge, raw byte copy
- // its not worth fine-graining this if there are deletions.
-
- // if the format is older, its always handled by the naive merge case above
- assert matchingFieldsReader.getVersion() == VERSION_CURRENT;
- matchingFieldsReader.checkIntegrity();
-
- // flush any pending chunks
- if (numBufferedDocs > 0) {
- numDirtyChunks++; // incomplete: we had to force this flush
- numDirtyDocs += numBufferedDocs;
- flush();
+ // copy bytes until the next chunk boundary (or end of chunk data).
+ // using the stored fields index for this isn't the most efficient, but fast enough
+ // and is a source of redundancy for detecting bad things.
+ final long endChunkPointer;
+ if (docID == sub.maxDoc) {
+ endChunkPointer = reader.getMaxPointer();
+ } else {
+ endChunkPointer = index.getStartPointer(docID);
}
-
- // iterate over each chunk. we use the stored fields index to find chunk boundaries,
- // read the docstart + doccount from the chunk header (we write a new header, since doc numbers will change),
- // and just copy the bytes directly.
- IndexInput rawDocs = matchingFieldsReader.getFieldsStream();
- FieldsIndex index = matchingFieldsReader.getIndexReader();
- rawDocs.seek(index.getStartPointer(0));
- int docID = 0;
- while (docID < maxDoc) {
- // read header
- int base = rawDocs.readVInt();
- if (base != docID) {
- throw new CorruptIndexException("invalid state: base=" + base + ", docID=" + docID, rawDocs);
- }
- int code = rawDocs.readVInt();
-
- // write a new index entry and new header for this chunk.
- int bufferedDocs = code >>> 1;
- indexWriter.writeIndex(bufferedDocs, fieldsStream.getFilePointer());
- fieldsStream.writeVInt(docBase); // rebase
- fieldsStream.writeVInt(code);
- docID += bufferedDocs;
- docBase += bufferedDocs;
- docCount += bufferedDocs;
-
- if (docID > maxDoc) {
- throw new CorruptIndexException("invalid state: base=" + base + ", count=" + bufferedDocs + ", maxDoc=" + maxDoc, rawDocs);
- }
-
- // copy bytes until the next chunk boundary (or end of chunk data).
- // using the stored fields index for this isn't the most efficient, but fast enough
- // and is a source of redundancy for detecting bad things.
- final long end;
- if (docID == maxDoc) {
- end = matchingFieldsReader.getMaxPointer();
- } else {
- end = index.getStartPointer(docID);
- }
- fieldsStream.copyBytes(rawDocs, end - rawDocs.getFilePointer());
+ fieldsStream.copyBytes(rawDocs, endChunkPointer - rawDocs.getFilePointer());
+ ++numChunks;
+ final boolean dirtyChunk = (code & 2) != 0;
+ if (dirtyChunk) {
+ assert bufferedDocs < maxDocsPerChunk;
+ ++numDirtyChunks;
+ numDirtyDocs += bufferedDocs;
}
-
- if (rawDocs.getFilePointer() != matchingFieldsReader.getMaxPointer()) {
- throw new CorruptIndexException("invalid state: pos=" + rawDocs.getFilePointer() + ", max=" + matchingFieldsReader.getMaxPointer(), rawDocs);
+ fromPointer = endChunkPointer;
+ } while (fromPointer < toPointer);
+ }
+
+ // copy leftover docs that don't form a complete chunk
+ assert reader.isLoaded(docID) == false;
+ while (docID < toDocID) {
+ copyOneDoc(reader, docID++);
+ }
+ }
+
+ @Override
+ public int merge(MergeState mergeState) throws IOException {
+ final MatchingReaders matchingReaders = new MatchingReaders(mergeState);
+ final MergeVisitor[] visitors = new MergeVisitor[mergeState.storedFieldsReaders.length];
+ final List<CompressingStoredFieldsMergeSub> subs =
+ new ArrayList<>(mergeState.storedFieldsReaders.length);
+ for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) {
+ final StoredFieldsReader reader = mergeState.storedFieldsReaders[i];
+ reader.checkIntegrity();
+ MergeStrategy mergeStrategy = getMergeStrategy(mergeState, matchingReaders, i);
+ if (mergeStrategy == MergeStrategy.VISITOR) {
+ visitors[i] = new MergeVisitor(mergeState, i);
+ }
+ subs.add(new CompressingStoredFieldsMergeSub(mergeState, mergeStrategy, i));
+ }
+ int docCount = 0;
+ final DocIDMerger<CompressingStoredFieldsMergeSub> docIDMerger =
+ DocIDMerger.of(subs, mergeState.needsIndexSort);
+ CompressingStoredFieldsMergeSub sub = docIDMerger.next();
+ while (sub != null) {
+ assert sub.mappedDocID == docCount : sub.mappedDocID + " != " + docCount;
+ final StoredFieldsReader reader = mergeState.storedFieldsReaders[sub.readerIndex];
+ if (sub.mergeStrategy == MergeStrategy.BULK) {
+ final int fromDocID = sub.docID;
+ int toDocID = fromDocID;
+ final CompressingStoredFieldsMergeSub current = sub;
+ while ((sub = docIDMerger.next()) == current) {
+ ++toDocID;
+ assert sub.docID == toDocID;
}
-
- // since we bulk merged all chunks, we inherit any dirty ones from this segment.
- numChunks += matchingFieldsReader.getNumChunks();
- numDirtyChunks += matchingFieldsReader.getNumDirtyChunks();
- numDirtyDocs += matchingFieldsReader.getNumDirtyDocs();
+ ++toDocID; // exclusive bound
+ copyChunks(mergeState, current, fromDocID, toDocID);
+ docCount += (toDocID - fromDocID);
+ } else if (sub.mergeStrategy == MergeStrategy.DOC) {
+ copyOneDoc((CompressingStoredFieldsReader) reader, sub.docID);
+ ++docCount;
+ sub = docIDMerger.next();
+ } else if (sub.mergeStrategy == MergeStrategy.VISITOR) {
+ assert visitors[sub.readerIndex] != null;
+ startDocument();
+ reader.visitDocument(sub.docID, visitors[sub.readerIndex]);
+ finishDocument();
+ ++docCount;
+ sub = docIDMerger.next();
} else {
- // optimized merge, we copy serialized (but decompressed) bytes directly
- // even on simple docs (1 stored field), it seems to help by about 20%
-
- // if the format is older, its always handled by the naive merge case above
- assert matchingFieldsReader.getVersion() == VERSION_CURRENT;
- matchingFieldsReader.checkIntegrity();
-
- for (int docID = 0; docID < maxDoc; docID++) {
- if (liveDocs != null && liveDocs.get(docID) == false) {
- continue;
- }
- SerializedDocument doc = matchingFieldsReader.document(docID);
- startDocument();
- bufferedDocs.copyBytes(doc.in, doc.length);
- numStoredFieldsInDoc = doc.numStoredFields;
- finishDocument();
- ++docCount;
- }
+ throw new AssertionError("Unknown merge strategy [" + sub.mergeStrategy + "]");
}
}
finish(mergeState.mergeFieldInfos, docCount);
@@ -681,15 +665,51 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
&& candidate.getNumDirtyChunks() * 100 > candidate.getNumChunks();
}
+ private enum MergeStrategy {
+ /** Copy chunk by chunk in a compressed format */
+ BULK,
+
+ /** Copy document by document in a decompressed format */
+ DOC,
+
+ /** Copy field by field of decompressed documents */
+ VISITOR
+ }
+
+ private MergeStrategy getMergeStrategy(
+ MergeState mergeState, MatchingReaders matchingReaders, int readerIndex) {
+ final StoredFieldsReader candidate = mergeState.storedFieldsReaders[readerIndex];
+ if (matchingReaders.matchingReaders[readerIndex] == false
+ || candidate instanceof CompressingStoredFieldsReader == false
+ || ((CompressingStoredFieldsReader) candidate).getVersion() != VERSION_CURRENT) {
+ return MergeStrategy.VISITOR;
+ }
+ CompressingStoredFieldsReader reader = (CompressingStoredFieldsReader) candidate;
+ if (BULK_MERGE_ENABLED
+ && reader.getCompressionMode() == compressionMode
+ && reader.getChunkSize() == chunkSize
+ && reader.getPackedIntsVersion() == PackedInts.VERSION_CURRENT
+ // its not worth fine-graining this if there are deletions.
+ && mergeState.liveDocs[readerIndex] == null
+ && !tooDirty(reader)) {
+ return MergeStrategy.BULK;
+ } else {
+ return MergeStrategy.DOC;
+ }
+ }
+
private static class CompressingStoredFieldsMergeSub extends DocIDMerger.Sub {
- private final CompressingStoredFieldsReader reader;
+ private final int readerIndex;
private final int maxDoc;
+ private final MergeStrategy mergeStrategy;
int docID = -1;
- CompressingStoredFieldsMergeSub(CompressingStoredFieldsReader reader, MergeState.DocMap docMap, int maxDoc) {
- super(docMap);
- this.maxDoc = maxDoc;
- this.reader = reader;
+ CompressingStoredFieldsMergeSub(
+ MergeState mergeState, MergeStrategy mergeStrategy, int readerIndex) {
+ super(mergeState.docMaps[readerIndex]);
+ this.readerIndex = readerIndex;
+ this.mergeStrategy = mergeStrategy;
+ this.maxDoc = mergeState.maxDocs[readerIndex];
}
@Override
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java
index 0c975cf..4713612 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java
@@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.IOException;
+import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
@@ -27,6 +28,8 @@ import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Function;
+import java.util.stream.Collectors;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
@@ -43,6 +46,8 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
@@ -819,4 +824,128 @@ public abstract class BaseStoredFieldsFormatTestCase extends BaseIndexFileFormat
IOUtils.close(iw, ir, everything);
IOUtils.close(dirs);
}
+
+ public void testRandomStoredFieldsWithIndexSort() throws Exception {
+ final SortField[] sortFields;
+ if (random().nextBoolean()) {
+ sortFields =
+ new SortField[]{
+ new SortField("sort-1", SortField.Type.LONG),
+ new SortField("sort-2", SortField.Type.INT)
+ };
+ } else {
+ sortFields = new SortField[]{new SortField("sort-1", SortField.Type.LONG)};
+ }
+ List<String> storedFields = new ArrayList<>();
+ int numFields = TestUtil.nextInt(random(), 1, 10);
+ for (int i = 0; i < numFields; i++) {
+ storedFields.add("f-" + i);
+ }
+ FieldType storeType = new FieldType(TextField.TYPE_STORED);
+ storeType.setStored(true);
+ Function<String, Document> documentFactory =
+ id -> {
+ Document doc = new Document();
+ doc.add(new StringField("id", id, random().nextBoolean() ? Store.YES : Store.NO));
+ if (random().nextInt(100) <= 5) {
+ Collections.shuffle(storedFields, random());
+ }
+ for (String fieldName : storedFields) {
+ if (random().nextBoolean()) {
+ String s = TestUtil.randomUnicodeString(random(), 100);
+ doc.add(newField(fieldName, s, storeType));
+ }
+ }
+ for (SortField sortField : sortFields) {
+ doc.add(
+ new NumericDocValuesField(
+ sortField.getField(), TestUtil.nextInt(random(), 0, 10000)));
+ }
+ return doc;
+ };
+
+ Map<String, Document> docs = new HashMap<>();
+ int numDocs = atLeast(100);
+ for (int i = 0; i < numDocs; i++) {
+ String id = Integer.toString(i);
+ docs.put(id, documentFactory.apply(id));
+ }
+
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setMaxBufferedDocs(TestUtil.nextInt(random(), 5, 20));
+ iwc.setIndexSort(new Sort(sortFields));
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+ List<String> addedIds = new ArrayList<>();
+ Runnable verifyStoreFields =
+ () -> {
+ try (DirectoryReader reader = maybeWrapWithMergingReader(iw.getReader())) {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ int iters = TestUtil.nextInt(random(), 1, 10);
+ for (int i = 0; i < iters; i++) {
+ String testID = addedIds.get(random().nextInt(addedIds.size()));
+ if (VERBOSE) {
+ System.out.println("TEST: test id=" + testID);
+ }
+ TopDocs hits = searcher.search(new TermQuery(new Term("id", testID)), 1);
+ assertEquals(1, hits.totalHits.value);
+ List<IndexableField> expectedFields =
+ docs.get(testID).getFields().stream()
+ .filter(f -> f.fieldType().stored())
+ .collect(Collectors.toList());
+ Document actualDoc = reader.document(hits.scoreDocs[0].doc);
+ assertEquals(expectedFields.size(), actualDoc.getFields().size());
+ for (IndexableField expectedField : expectedFields) {
+ IndexableField[] actualFields = actualDoc.getFields(expectedField.name());
+ assertEquals(1, actualFields.length);
+ assertEquals(expectedField.stringValue(), actualFields[0].stringValue());
+ }
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ };
+ final List<String> ids = new ArrayList<>(docs.keySet());
+ Collections.shuffle(ids, random());
+ for (String id : ids) {
+ if (random().nextInt(100) < 5) {
+ // add via foreign reader
+ IndexWriterConfig otherIwc = newIndexWriterConfig();
+ otherIwc.setIndexSort(new Sort(sortFields));
+ try (Directory otherDir = newDirectory();
+ RandomIndexWriter otherIw = new RandomIndexWriter(random(), otherDir, otherIwc)) {
+ otherIw.addDocument(docs.get(id));
+ try (DirectoryReader otherReader = otherIw.getReader()) {
+ TestUtil.addIndexesSlowly(iw.w, otherReader);
+ }
+ }
+ } else {
+ // add normally
+ iw.addDocument(docs.get(id));
+ }
+ addedIds.add(id);
+ if (random().nextInt(100) < 5) {
+ String deletingId = addedIds.remove(random().nextInt(addedIds.size()));
+ if (random().nextBoolean()) {
+ iw.deleteDocuments(new TermQuery(new Term("id", deletingId)));
+ addedIds.remove(deletingId);
+ } else {
+ final Document newDoc = documentFactory.apply(deletingId);
+ docs.put(deletingId, newDoc);
+ iw.updateDocument(new Term("id", deletingId), newDoc);
+ }
+ }
+ if (random().nextInt(100) < 5) {
+ verifyStoreFields.run();
+ }
+ if (random().nextInt(100) < 2) {
+ iw.forceMerge(TestUtil.nextInt(random(), 1, 3));
+ }
+ }
+ verifyStoreFields.run();
+ iw.forceMerge(TestUtil.nextInt(random(), 1, 3));
+ verifyStoreFields.run();
+ IOUtils.close(iw, dir);
+ }
+
}