You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by pa...@apache.org on 2022/11/22 07:52:46 UTC
[lucene] branch main updated: Support deletions in rearrange (#11815)
This is an automated email from the ASF dual-hosted git repository.
patrickz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 369a70f2894 Support deletions in rearrange (#11815)
369a70f2894 is described below
commit 369a70f28948e748ed157c0731f36c367c4a03fa
Author: Stefan Vodita <41...@users.noreply.github.com>
AuthorDate: Tue Nov 22 07:52:38 2022 +0000
Support deletions in rearrange (#11815)
* Support deletions in rearrange
* Store BinaryDocValues in the binary doc value selector as BytesRef
instead of String.
---
lucene/CHANGES.txt | 2 +
.../lucene/misc/index/BinaryDocValueSelector.java | 83 +++++--
.../apache/lucene/misc/index/IndexRearranger.java | 168 ++++++++++---
...ranger.java => TestBinaryDocValueSelector.java} | 104 ++-------
.../lucene/misc/index/TestIndexRearranger.java | 260 +++++++++++++++------
5 files changed, 414 insertions(+), 203 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5257c959e32..45b1c4944fb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -37,6 +37,8 @@ API Changes
* GITHUB#11840: Query rewrite now takes an IndexSearcher instead of IndexReader to enable concurrent
rewriting. (Patrick Zhai)
+* GITHUB#11814: Support deletions in IndexRearranger. (Stefan Vodita)
+
New Features
---------------------
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
index 082675f4dcc..c1a54b72476 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
@@ -22,6 +22,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
@@ -30,52 +31,94 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
-/** Select documents using binary doc values */
+/**
+ * Use this selector to rearrange an index where documents can be uniquely identified based on
+ * {@link BinaryDocValues}
+ */
public class BinaryDocValueSelector implements IndexRearranger.DocumentSelector, Serializable {
private final String field;
- private final HashSet<String> keySet;
+ private final Set<BytesRef> keySet;
- public BinaryDocValueSelector(String field, HashSet<String> keySet) {
+ public BinaryDocValueSelector(String field, Set<BytesRef> keySet) {
this.field = field;
this.keySet = keySet;
}
@Override
- public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+ public BitSet getFilteredDocs(CodecReader reader) throws IOException {
BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field);
- Bits oldLiveDocs = reader.getLiveDocs();
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
- for (int i = 0; i < reader.maxDoc(); i++) {
- if (oldLiveDocs != null && oldLiveDocs.get(i) == false) {
- continue;
- }
- if (binaryDocValues.advanceExact(i)
- && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) {
- bits.set(i);
+ for (int docid = 0; docid < reader.maxDoc(); docid++) {
+ if (binaryDocValues.advanceExact(docid) && keySet.contains(binaryDocValues.binaryValue())) {
+ bits.set(docid);
}
}
return bits;
}
- public static List<IndexRearranger.DocumentSelector> createFromExistingIndex(
+ /**
+ * Create a selector for the deletes in an index, which can then be applied to a rearranged index
+ *
+ * @param field tells which {@link BinaryDocValues} are the unique key
+ * @param directory where the original index is present
+ * @return a deletes selector to be passed to {@link IndexRearranger}
+ */
+ public static IndexRearranger.DocumentSelector createDeleteSelectorFromIndex(
String field, Directory directory) throws IOException {
- List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+
+ Set<BytesRef> keySet = new HashSet<>();
+
try (IndexReader reader = DirectoryReader.open(directory)) {
for (LeafReaderContext context : reader.leaves()) {
- HashSet<String> keySet = new HashSet<>();
Bits liveDocs = context.reader().getLiveDocs();
+ if (liveDocs == null) {
+ continue;
+ }
+
BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field);
- for (int i = 0; i < context.reader().maxDoc(); i++) {
- if (liveDocs != null && liveDocs.get(i) == false) {
+ for (int docid = 0; docid < context.reader().maxDoc(); docid++) {
+ if (liveDocs.get(docid) == true) {
continue;
}
- if (binaryDocValues.advanceExact(i)) {
- keySet.add(binaryDocValues.binaryValue().utf8ToString());
+
+ if (binaryDocValues.advanceExact(docid)) {
+ keySet.add(BytesRef.deepCopyOf(binaryDocValues.binaryValue()));
+ } else {
+ throw new AssertionError("Document " + docid + " doesn't have key " + field);
+ }
+ }
+ }
+ }
+ return new BinaryDocValueSelector(field, keySet);
+ }
+
+ /**
+ * Create a list of selectors that will reproduce the index geometry when used with {@link
+ * IndexRearranger}
+ *
+ * @param field tells which {@link BinaryDocValues} are the unique key
+ * @param directory where the original index is present
+ * @return a list of selectors to be passed to {@link IndexRearranger}
+ */
+ public static List<IndexRearranger.DocumentSelector> createLiveSelectorsFromIndex(
+ String field, Directory directory) throws IOException {
+
+ List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+
+ try (IndexReader reader = DirectoryReader.open(directory)) {
+ for (LeafReaderContext context : reader.leaves()) {
+ Set<BytesRef> keySet = new HashSet<>();
+ BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field);
+
+ for (int docid = 0; docid < context.reader().maxDoc(); docid++) {
+ if (binaryDocValues.advanceExact(docid)) {
+ keySet.add(BytesRef.deepCopyOf(binaryDocValues.binaryValue()));
} else {
- throw new AssertionError("Document don't have selected key");
+ throw new AssertionError("Document " + docid + " doesn't have key " + field);
}
}
selectors.add(new BinaryDocValueSelector(field, keySet));
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
index c05a1a7b959..7c3bd53cece 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@@ -43,7 +44,20 @@ import org.apache.lucene.util.NamedThreadFactory;
/**
* Copy and rearrange index according to document selectors, from input dir to output dir. Length of
- * documentSelectors determines how many segments there will be
+ * documentSelectors determines how many segments there will be.
+ *
+ * <p>Rearranging works in 3 steps: 1. Assume all docs in the original index are live and create the
+ * rearranged index using the segment selectors. 2. Reorder the segments to match the order of the
+ * selectors and check the validity of the rearranged index. 3. Go through the rearranged index and
+ * apply deletes requested by the deletes selector.
+ *
+ * <p>NB: You can't produce segments that only contain deletes. If you select all documents in a
+ * segment for deletion, the entire segment will be discarded.
+ *
+ * <p>Example use case: You are testing search performance after a change to indexing. You can index
+ * the same content using the old and new indexers and then rearrange one of them to the shape of
+ * the other. Using rearrange will give more accurate measurements, since you will not be
+ * introducing noise from index geometry.
*
* <p>TODO: another possible (faster) approach to do this is to manipulate FlushPolicy and
* MergePolicy at indexing time to create small desired segments first and merge them accordingly
@@ -54,59 +68,89 @@ import org.apache.lucene.util.NamedThreadFactory;
public class IndexRearranger {
protected final Directory input, output;
protected final IndexWriterConfig config;
- protected final List<DocumentSelector> documentSelectors;
+
+ // Each of these selectors will produce a segment in the rearranged index.
+ // The segments will appear in the index in the order of the selectors that produced them.
+ protected final List<DocumentSelector> segmentSelectors;
+
+ // Documents selected here will be marked for deletion in the rearranged index, but not merged
+ // away.
+ protected final DocumentSelector deletedDocsSelector;
/**
- * Constructor
+ * All args constructor
*
* @param input input dir
* @param output output dir
* @param config index writer config
- * @param documentSelectors specify what document is desired in the rearranged index segments,
- * each selector correspond to one segment
+ * @param segmentSelectors specify which documents are desired in the rearranged index segments;
+ * each selector corresponds to one segment
+ * @param deletedDocsSelector specify which documents are to be marked for deletion in the
+ * rearranged index; this selector should be thread-safe
*/
public IndexRearranger(
Directory input,
Directory output,
IndexWriterConfig config,
- List<DocumentSelector> documentSelectors) {
+ List<DocumentSelector> segmentSelectors,
+ DocumentSelector deletedDocsSelector) {
this.input = input;
this.output = output;
this.config = config;
- this.documentSelectors = documentSelectors;
+ this.segmentSelectors = segmentSelectors;
+ this.deletedDocsSelector = deletedDocsSelector;
+ }
+
+ /** Constructor with no deletes to apply */
+ public IndexRearranger(
+ Directory input,
+ Directory output,
+ IndexWriterConfig config,
+ List<DocumentSelector> segmentSelectors) {
+ this(input, output, config, segmentSelectors, null);
}
public void execute() throws Exception {
- config.setMergePolicy(
- NoMergePolicy.INSTANCE); // do not merge since one addIndexes call create one segment
- try (IndexWriter writer = new IndexWriter(output, config);
+ ExecutorService executor =
+ Executors.newFixedThreadPool(
+ Math.min(Runtime.getRuntime().availableProcessors(), segmentSelectors.size()),
+ new NamedThreadFactory("rearranger"));
+
+ IndexWriterConfig createSegmentsConfig = new IndexWriterConfig(config.getAnalyzer());
+ IndexWriterConfig applyDeletesConfig = new IndexWriterConfig(config.getAnalyzer());
+
+ // Do not merge - each addIndexes call creates one segment
+ createSegmentsConfig.setMergePolicy(NoMergePolicy.INSTANCE);
+ applyDeletesConfig.setMergePolicy(NoMergePolicy.INSTANCE);
+
+ try (IndexWriter writer = new IndexWriter(output, createSegmentsConfig);
IndexReader reader = DirectoryReader.open(input)) {
- ExecutorService executor =
- Executors.newFixedThreadPool(
- Math.min(Runtime.getRuntime().availableProcessors(), documentSelectors.size()),
- new NamedThreadFactory("rearranger"));
- ArrayList<Future<Void>> futures = new ArrayList<>();
- for (DocumentSelector record : documentSelectors) {
- Callable<Void> addSegment =
- () -> {
- addOneSegment(writer, reader, record);
- return null;
- };
- futures.add(executor.submit(addSegment));
- }
- for (Future<Void> future : futures) {
- future.get();
- }
- executor.shutdown();
+ createRearrangedIndex(writer, reader, segmentSelectors, executor);
}
+ finalizeRearrange(output, segmentSelectors);
+
+ try (IndexWriter writer = new IndexWriter(output, applyDeletesConfig);
+ IndexReader reader = DirectoryReader.open(writer)) {
+ applyDeletes(writer, reader, deletedDocsSelector, executor);
+ }
+ executor.shutdown();
+ }
+
+ /**
+ * Place segments in the order of their respective selectors and ensure the rearrange was
+ * performed correctly.
+ */
+ private static void finalizeRearrange(Directory output, List<DocumentSelector> segmentSelectors)
+ throws IOException {
List<SegmentCommitInfo> ordered = new ArrayList<>();
try (IndexReader reader = DirectoryReader.open(output)) {
- for (DocumentSelector ds : documentSelectors) {
+ for (DocumentSelector ds : segmentSelectors) {
int foundLeaf = -1;
for (LeafReaderContext context : reader.leaves()) {
SegmentReader sr = (SegmentReader) context.reader();
- int docFound = ds.getFilteredLiveDocs(sr).nextSetBit(0);
+ int docFound = ds.getFilteredDocs(sr).nextSetBit(0);
if (docFound != DocIdSetIterator.NO_MORE_DOCS) {
+ // Each document can be mapped to one segment at most
if (foundLeaf != -1) {
throw new IllegalStateException(
"Document selector "
@@ -129,6 +173,32 @@ public class IndexRearranger {
sis.commit(output);
}
+ /**
+ * Create the rearranged index as described by the segment selectors. Assume all documents in the
+ * original index are live.
+ */
+ private static void createRearrangedIndex(
+ IndexWriter writer,
+ IndexReader reader,
+ List<DocumentSelector> selectors,
+ ExecutorService executor)
+ throws ExecutionException, InterruptedException {
+
+ ArrayList<Future<Void>> futures = new ArrayList<>();
+ for (DocumentSelector selector : selectors) {
+ Callable<Void> addSegment =
+ () -> {
+ addOneSegment(writer, reader, selector);
+ return null;
+ };
+ futures.add(executor.submit(addSegment));
+ }
+
+ for (Future<Void> future : futures) {
+ future.get();
+ }
+ }
+
private static void addOneSegment(
IndexWriter writer, IndexReader reader, DocumentSelector selector) throws IOException {
CodecReader[] readers = new CodecReader[reader.leaves().size()];
@@ -139,6 +209,42 @@ public class IndexRearranger {
writer.addIndexes(readers);
}
+ private static void applyDeletes(
+ IndexWriter writer, IndexReader reader, DocumentSelector selector, ExecutorService executor)
+ throws ExecutionException, InterruptedException {
+ if (selector == null) {
+ // There are no deletes to be applied
+ return;
+ }
+
+ ArrayList<Future<Void>> futures = new ArrayList<>();
+ for (LeafReaderContext context : reader.leaves()) {
+ Callable<Void> applyDeletesToSegment =
+ () -> {
+ applyDeletesToOneSegment(writer, (CodecReader) context.reader(), selector);
+ return null;
+ };
+ futures.add(executor.submit(applyDeletesToSegment));
+ }
+
+ for (Future<Void> future : futures) {
+ future.get();
+ }
+ }
+
+ private static void applyDeletesToOneSegment(
+ IndexWriter writer, CodecReader segmentReader, DocumentSelector selector) throws IOException {
+ Bits deletedDocs = selector.getFilteredDocs(segmentReader);
+ for (int docid = 0; docid < segmentReader.maxDoc(); ++docid) {
+ if (deletedDocs.get(docid)) {
+ if (writer.tryDeleteDocument(segmentReader, docid) == -1) {
+ throw new IllegalStateException(
+ "tryDeleteDocument has failed. This should never happen, since merging is disabled.");
+ }
+ }
+ }
+ }
+
private static class DocSelectorFilteredCodecReader extends FilterCodecReader {
BitSet filteredLiveDocs;
@@ -147,7 +253,7 @@ public class IndexRearranger {
public DocSelectorFilteredCodecReader(CodecReader in, DocumentSelector selector)
throws IOException {
super(in);
- filteredLiveDocs = selector.getFilteredLiveDocs(in);
+ filteredLiveDocs = selector.getFilteredDocs(in);
numDocs = filteredLiveDocs.cardinality();
}
@@ -174,6 +280,6 @@ public class IndexRearranger {
/** Select document within a CodecReader */
public interface DocumentSelector {
- BitSet getFilteredLiveDocs(CodecReader reader) throws IOException;
+ BitSet getFilteredDocs(CodecReader reader) throws IOException;
}
}
diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBinaryDocValueSelector.java
similarity index 54%
copy from lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
copy to lucene/misc/src/test/org/apache/lucene/misc/index/TestBinaryDocValueSelector.java
index afa75a3f330..4576c571ba3 100644
--- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
+++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBinaryDocValueSelector.java
@@ -18,11 +18,11 @@
package org.apache.lucene.misc.index;
import java.io.IOException;
-import java.util.List;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
-import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -30,54 +30,21 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Term;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.util.LuceneTestCase;
-import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.FixedBitSet;
-
-public class TestIndexRearranger extends LuceneTestCase {
- public void testRearrange() throws Exception {
- Directory inputDir = newDirectory();
- createIndex(100, 10, inputDir);
-
- Directory outputDir = newDirectory();
- IndexRearranger rearranger =
- new IndexRearranger(
- inputDir,
- outputDir,
- getIndexWriterConfig(),
- List.of(new OddDocSelector(), new EvenDocSelector()));
- rearranger.execute();
- IndexReader reader = DirectoryReader.open(outputDir);
- assertEquals(2, reader.leaves().size());
- LeafReader segment1 = reader.leaves().get(0).reader();
- assertEquals(50, segment1.numDocs());
- NumericDocValues numericDocValues = segment1.getNumericDocValues("ord");
- assertTrue(numericDocValues.advanceExact(0));
- boolean odd = numericDocValues.longValue() % 2 == 1;
- for (int i = 1; i < 50; i++) {
- assertTrue(numericDocValues.advanceExact(i));
- assertEquals(odd, numericDocValues.longValue() % 2 == 1);
- }
- LeafReader segment2 = reader.leaves().get(1).reader();
- assertEquals(50, segment2.numDocs());
- numericDocValues = segment2.getNumericDocValues("ord");
- assertTrue(numericDocValues.advanceExact(0));
- boolean odd2 = numericDocValues.longValue() % 2 == 1;
- assertTrue(odd != odd2);
- for (int i = 1; i < 50; i++) {
- assertTrue(numericDocValues.advanceExact(i));
- assertEquals(odd2, numericDocValues.longValue() % 2 == 1);
- }
- reader.close();
- inputDir.close();
- outputDir.close();
- }
+/**
+ * This test creates a source index with 100 documents and 10 segments where all the documents have
+ * a binary doc value, "textOrd", which is filled with consecutive integers. The documents with even
+ * "textOrd" are marked for deletion. A second index with the same documents in 4 segments is then
+ * rearranged into the 10-segment shape of the source index using {@link BinaryDocValueSelector}.
+ */
+public class TestBinaryDocValueSelector extends LuceneTestCase {
public void testRearrangeUsingBinaryDocValueSelector() throws Exception {
Directory srcDir = newDirectory();
createIndex(100, 10, srcDir);
@@ -93,7 +60,8 @@ public class TestIndexRearranger extends LuceneTestCase {
inputDir,
outputDir,
getIndexWriterConfig(),
- BinaryDocValueSelector.createFromExistingIndex("textOrd", srcDir));
+ BinaryDocValueSelector.createLiveSelectorsFromIndex("textOrd", srcDir),
+ BinaryDocValueSelector.createDeleteSelectorFromIndex("textOrd", srcDir));
rearranger.execute();
assertSequentialIndex(outputDir, 100, 10);
@@ -109,11 +77,15 @@ public class TestIndexRearranger extends LuceneTestCase {
for (int i = 0; i < segNum; i++) {
LeafReader leafReader = reader.leaves().get(i).reader();
NumericDocValues numericDocValues = leafReader.getNumericDocValues("ord");
+ Bits liveDocs = leafReader.getLiveDocs();
+ assertNotNull(liveDocs);
- for (int doc = 0; doc < leafReader.numDocs(); doc++) {
+ for (int doc = 0; doc < leafReader.maxDoc(); doc++) {
assertTrue(numericDocValues.advanceExact(doc));
assertEquals(numericDocValues.longValue(), lastOrd + 1);
lastOrd = numericDocValues.longValue();
+
+ assertEquals(liveDocs.get(doc), lastOrd % 2 != 0);
}
}
assertEquals(docNum, lastOrd + 1);
@@ -134,8 +106,12 @@ public class TestIndexRearranger extends LuceneTestCase {
Document doc = new Document();
doc.add(new NumericDocValuesField("ord", i));
doc.add(new BinaryDocValuesField("textOrd", new BytesRef(Integer.toString(i))));
+ if (i % 2 == 0) {
+ doc.add(new StringField("delete", "yes", Field.Store.YES));
+ }
w.addDocument(doc);
if (i % docPerSeg == docPerSeg - 1) {
+ w.deleteDocuments(new Term("delete", "yes"));
w.commit();
}
}
@@ -144,42 +120,4 @@ public class TestIndexRearranger extends LuceneTestCase {
reader.close();
w.close();
}
-
- private static class OddDocSelector implements IndexRearranger.DocumentSelector {
-
- @Override
- public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
- FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
- Bits liveDocs = reader.getLiveDocs();
- NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
- for (int i = 0; i < reader.maxDoc(); i++) {
- if (liveDocs != null && liveDocs.get(i) == false) {
- continue;
- }
- if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 1) {
- filteredSet.set(i);
- }
- }
- return filteredSet;
- }
- }
-
- private static class EvenDocSelector implements IndexRearranger.DocumentSelector {
-
- @Override
- public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
- FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
- Bits liveDocs = reader.getLiveDocs();
- NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
- for (int i = 0; i < reader.maxDoc(); i++) {
- if (liveDocs != null && liveDocs.get(i) == false) {
- continue;
- }
- if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 0) {
- filteredSet.set(i);
- }
- }
- return filteredSet;
- }
- }
}
diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
index afa75a3f330..3907b19f13c 100644
--- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
+++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
@@ -18,10 +18,12 @@
package org.apache.lucene.misc.index;
import java.io.IOException;
+import java.util.Arrays;
import java.util.List;
-import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -30,6 +32,7 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Term;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.Directory;
@@ -39,10 +42,47 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
+/**
+ * This test creates the following index:
+ *
+ * <p>segment 0: (ord = 0, live = true), (ord = 1, live = false)
+ *
+ * <p>segment 1: (ord = 2, live = true), (ord = 3, live = false)
+ *
+ * <p>segment 2: (ord = 4, live = true), (ord = 5, live = false)
+ *
+ * <p>It also creates 3 selectors:
+ *
+ * <p>{@link LiveToLiveSelector} selects for ord in {0, 2}
+ *
+ * <p>{@link DeleteToLiveSelector} selects for ord in {3, 5}
+ *
+ * <p>{@link DeleteSelector} selects for ord in {1, 2, 4, 5}
+ */
public class TestIndexRearranger extends LuceneTestCase {
- public void testRearrange() throws Exception {
+ /**
+ * Use {@link LiveToLiveSelector} and {@link DeleteToLiveSelector} to rearrange the index into 2
+ * segments:
+ *
+ * <p>segment 0: (ord = 0, live = true), (ord = 2, live = true)
+ *
+ * <p>segment 1: (ord = 3, live = true), (ord = 5, live = true)
+ *
+ * <p>The documents with ord 1 and 4 have now been lost, since they were not selected to be in the
+ * rearranged index. All documents that were selected are now considered live.
+ *
+ * <p>Next, we apply the deletions specified by {@link DeleteSelector}:
+ *
+ * <p>segment 0: (ord = 0, live = true), (ord = 2, live = false)
+ *
+ * <p>segment 1: (ord = 3, live = true), (ord = 5, live = false)
+ *
+ * <p>The documents with ord 2 and 5 have been marked for deletion. Documents 1 and 4 would have
+ * also been marked if they were present.
+ */
+ public void testLiveDeleteCombinations() throws Exception {
Directory inputDir = newDirectory();
- createIndex(100, 10, inputDir);
+ createIndex(inputDir);
Directory outputDir = newDirectory();
IndexRearranger rearranger =
@@ -50,42 +90,55 @@ public class TestIndexRearranger extends LuceneTestCase {
inputDir,
outputDir,
getIndexWriterConfig(),
- List.of(new OddDocSelector(), new EvenDocSelector()));
+ List.of(
+ new TestIndexRearranger.LiveToLiveSelector(),
+ new TestIndexRearranger.DeleteToLiveSelector()),
+ new TestIndexRearranger.DeleteSelector());
rearranger.execute();
+
IndexReader reader = DirectoryReader.open(outputDir);
assertEquals(2, reader.leaves().size());
+
LeafReader segment1 = reader.leaves().get(0).reader();
- assertEquals(50, segment1.numDocs());
+ assertEquals(1, segment1.numDocs());
+ assertEquals(2, segment1.maxDoc());
+ Bits liveDocs = segment1.getLiveDocs();
+ assertNotNull(liveDocs);
NumericDocValues numericDocValues = segment1.getNumericDocValues("ord");
+ assertNotNull(numericDocValues);
assertTrue(numericDocValues.advanceExact(0));
- boolean odd = numericDocValues.longValue() % 2 == 1;
- for (int i = 1; i < 50; i++) {
- assertTrue(numericDocValues.advanceExact(i));
- assertEquals(odd, numericDocValues.longValue() % 2 == 1);
- }
+ assertTrue(liveDocs.get(0));
+ assertEquals(0, numericDocValues.longValue());
+ assertTrue(numericDocValues.advanceExact(1));
+ assertFalse(liveDocs.get(1));
+ assertEquals(2, numericDocValues.longValue());
+
LeafReader segment2 = reader.leaves().get(1).reader();
- assertEquals(50, segment2.numDocs());
+ assertEquals(1, segment2.numDocs());
+ assertEquals(2, segment2.maxDoc());
+ liveDocs = segment1.getLiveDocs();
+ assertNotNull(liveDocs);
numericDocValues = segment2.getNumericDocValues("ord");
+ assertNotNull(numericDocValues);
+ assertTrue(liveDocs.get(0));
assertTrue(numericDocValues.advanceExact(0));
- boolean odd2 = numericDocValues.longValue() % 2 == 1;
- assertTrue(odd != odd2);
- for (int i = 1; i < 50; i++) {
- assertTrue(numericDocValues.advanceExact(i));
- assertEquals(odd2, numericDocValues.longValue() % 2 == 1);
- }
+ assertEquals(3, numericDocValues.longValue());
+ assertFalse(liveDocs.get(1));
+ assertTrue(numericDocValues.advanceExact(1));
+ assertEquals(5, numericDocValues.longValue());
+
reader.close();
inputDir.close();
outputDir.close();
}
- public void testRearrangeUsingBinaryDocValueSelector() throws Exception {
- Directory srcDir = newDirectory();
- createIndex(100, 10, srcDir);
- assertSequentialIndex(srcDir, 100, 10);
-
+ /**
+ * This test arrives at an empty rearranged index by using the same selector for creating segments
+ * and applying deletes.
+ */
+ public void testDeleteEverything() throws Exception {
Directory inputDir = newDirectory();
- createIndex(100, 4, inputDir);
- assertSequentialIndex(inputDir, 100, 4);
+ createIndex(inputDir);
Directory outputDir = newDirectory();
IndexRearranger rearranger =
@@ -93,31 +146,64 @@ public class TestIndexRearranger extends LuceneTestCase {
inputDir,
outputDir,
getIndexWriterConfig(),
- BinaryDocValueSelector.createFromExistingIndex("textOrd", srcDir));
+ List.of(new TestIndexRearranger.LiveToLiveSelector()),
+ new TestIndexRearranger.LiveToLiveSelector());
rearranger.execute();
- assertSequentialIndex(outputDir, 100, 10);
- outputDir.close();
+ IndexReader reader = DirectoryReader.open(outputDir);
+ assertEquals(0, reader.leaves().size());
+
+ reader.close();
inputDir.close();
- srcDir.close();
+ outputDir.close();
}
- private static void assertSequentialIndex(Directory dir, int docNum, int segNum)
- throws IOException {
- IndexReader reader = DirectoryReader.open(dir);
- long lastOrd = -1;
- for (int i = 0; i < segNum; i++) {
- LeafReader leafReader = reader.leaves().get(i).reader();
- NumericDocValues numericDocValues = leafReader.getNumericDocValues("ord");
-
- for (int doc = 0; doc < leafReader.numDocs(); doc++) {
- assertTrue(numericDocValues.advanceExact(doc));
- assertEquals(numericDocValues.longValue(), lastOrd + 1);
- lastOrd = numericDocValues.longValue();
- }
- }
- assertEquals(docNum, lastOrd + 1);
+ /** Don't pass a deletes selector, all selected docs will be live. */
+ public void testDeleteNothing() throws Exception {
+ Directory inputDir = newDirectory();
+ createIndex(inputDir);
+
+ Directory outputDir = newDirectory();
+ IndexRearranger rearranger =
+ new IndexRearranger(
+ inputDir,
+ outputDir,
+ getIndexWriterConfig(),
+ List.of(
+ new TestIndexRearranger.LiveToLiveSelector(),
+ new TestIndexRearranger.DeleteToLiveSelector()));
+ rearranger.execute();
+
+ IndexReader reader = DirectoryReader.open(outputDir);
+ assertEquals(2, reader.leaves().size());
+
+ LeafReader segment1 = reader.leaves().get(0).reader();
+ assertEquals(2, segment1.numDocs());
+ assertEquals(2, segment1.maxDoc());
+ Bits liveDocs = segment1.getLiveDocs();
+ assertNull(liveDocs);
+ NumericDocValues numericDocValues = segment1.getNumericDocValues("ord");
+ assertNotNull(numericDocValues);
+ assertTrue(numericDocValues.advanceExact(0));
+ assertEquals(0, numericDocValues.longValue());
+ assertTrue(numericDocValues.advanceExact(1));
+ assertEquals(2, numericDocValues.longValue());
+
+ LeafReader segment2 = reader.leaves().get(1).reader();
+ assertEquals(2, segment2.numDocs());
+ assertEquals(2, segment2.maxDoc());
+ liveDocs = segment1.getLiveDocs();
+ assertNull(liveDocs);
+ numericDocValues = segment2.getNumericDocValues("ord");
+ assertNotNull(numericDocValues);
+ assertTrue(numericDocValues.advanceExact(0));
+ assertEquals(3, numericDocValues.longValue());
+ assertTrue(numericDocValues.advanceExact(1));
+ assertEquals(5, numericDocValues.longValue());
+
reader.close();
+ inputDir.close();
+ outputDir.close();
}
private static IndexWriterConfig getIndexWriterConfig() {
@@ -127,56 +213,92 @@ public class TestIndexRearranger extends LuceneTestCase {
.setIndexSort(new Sort(new SortField("ord", SortField.Type.INT)));
}
- private static void createIndex(int docNum, int segNum, Directory dir) throws IOException {
+ private static void createIndex(Directory dir) throws IOException {
IndexWriter w = new IndexWriter(dir, getIndexWriterConfig());
- int docPerSeg = (int) Math.ceil((double) docNum / segNum);
- for (int i = 0; i < docNum; i++) {
+ for (int i = 0; i < 6; i++) {
Document doc = new Document();
doc.add(new NumericDocValuesField("ord", i));
- doc.add(new BinaryDocValuesField("textOrd", new BytesRef(Integer.toString(i))));
+ if (i % 2 == 1) {
+ doc.add(new StringField("delete", new BytesRef("yes"), Field.Store.YES));
+ }
w.addDocument(doc);
- if (i % docPerSeg == docPerSeg - 1) {
+ if (i % 2 == 1) {
+ w.deleteDocuments(new Term("delete", "yes"));
w.commit();
}
}
- IndexReader reader = DirectoryReader.open(w);
- assertEquals(segNum, reader.leaves().size());
- reader.close();
+
+ assertCreatedIndex(dir);
w.close();
}
- private static class OddDocSelector implements IndexRearranger.DocumentSelector {
+ private static void assertCreatedIndex(Directory dir) throws IOException {
+ IndexReader reader = DirectoryReader.open(dir);
+ assertEquals(3, reader.leaves().size());
+
+ for (int i = 0; i < 3; i++) {
+ LeafReader segmentReader = reader.leaves().get(i).reader();
+ assertEquals(1, segmentReader.numDocs());
+ assertEquals(2, segmentReader.maxDoc());
+ Bits liveDocs = segmentReader.getLiveDocs();
+ assertNotNull(liveDocs);
+ assertTrue(liveDocs.get(0));
+ assertFalse(liveDocs.get(1));
+
+ NumericDocValues ord = segmentReader.getNumericDocValues("ord");
+ assertNotNull(ord);
+ assertTrue(ord.advanceExact(0));
+ assertEquals(2 * i, ord.longValue());
+ assertTrue(ord.advanceExact(1));
+ assertEquals(2 * i + 1, ord.longValue());
+ }
+
+ reader.close();
+ }
+
+ private static class LiveToLiveSelector implements IndexRearranger.DocumentSelector {
@Override
- public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+ public BitSet getFilteredDocs(CodecReader reader) throws IOException {
FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
- Bits liveDocs = reader.getLiveDocs();
NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
- for (int i = 0; i < reader.maxDoc(); i++) {
- if (liveDocs != null && liveDocs.get(i) == false) {
- continue;
- }
- if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 1) {
- filteredSet.set(i);
+ assert numericDocValues != null;
+ for (int docid = 0; docid < reader.maxDoc(); docid++) {
+ if (numericDocValues.advanceExact(docid)
+ && Arrays.asList(0, 2).contains((int) numericDocValues.longValue())) {
+ filteredSet.set(docid);
}
}
return filteredSet;
}
}
- private static class EvenDocSelector implements IndexRearranger.DocumentSelector {
-
+ private static class DeleteToLiveSelector implements IndexRearranger.DocumentSelector {
@Override
- public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+ public BitSet getFilteredDocs(CodecReader reader) throws IOException {
FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
- Bits liveDocs = reader.getLiveDocs();
NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
- for (int i = 0; i < reader.maxDoc(); i++) {
- if (liveDocs != null && liveDocs.get(i) == false) {
- continue;
+ assert numericDocValues != null;
+ for (int docid = 0; docid < reader.maxDoc(); docid++) {
+ if (numericDocValues.advanceExact(docid)
+ && Arrays.asList(3, 5).contains((int) numericDocValues.longValue())) {
+ filteredSet.set(docid);
}
- if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 0) {
- filteredSet.set(i);
+ }
+ return filteredSet;
+ }
+ }
+
+ private static class DeleteSelector implements IndexRearranger.DocumentSelector {
+ @Override
+ public BitSet getFilteredDocs(CodecReader reader) throws IOException {
+ FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
+ NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
+ assert numericDocValues != null;
+ for (int docid = 0; docid < reader.maxDoc(); docid++) {
+ if (numericDocValues.advanceExact(docid)
+ && Arrays.asList(1, 2, 4, 5).contains((int) numericDocValues.longValue())) {
+ filteredSet.set(docid);
}
}
return filteredSet;