You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/01/29 20:26:55 UTC
[lucene-solr] 01/03: LUCENE-9694: New tool for creating a
deterministic index (#2246)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 3da86cb5b368c7325ac89be709b17f9c65eeafdf
Author: Patrick Zhai <zh...@users.noreply.github.com>
AuthorDate: Fri Jan 29 10:32:24 2021 -0800
LUCENE-9694: New tool for creating a deterministic index (#2246)
---
lucene/CHANGES.txt | 8 +
.../lucene/misc/index/BinaryDocValueSelector.java | 86 ++++++++++
.../apache/lucene/misc/index/IndexRearranger.java | 136 +++++++++++++++
.../lucene/misc/index/TestIndexRearranger.java | 191 +++++++++++++++++++++
4 files changed, 421 insertions(+)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index abc588d..9d8eed9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -29,6 +29,14 @@ Other
---------------------
(No changes)
+======================= Lucene 8.9.0 =======================
+
+New Features
+---------------------
+
+* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
+ on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai)
+
======================= Lucene 8.8.0 =======================
API Changes
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
new file mode 100644
index 0000000..082675f
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+
+/** Select documents using binary doc values */
+public class BinaryDocValueSelector implements IndexRearranger.DocumentSelector, Serializable {
+
+ private final String field;
+ private final HashSet<String> keySet;
+
+ public BinaryDocValueSelector(String field, HashSet<String> keySet) {
+ this.field = field;
+ this.keySet = keySet;
+ }
+
+ @Override
+ public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+ BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field);
+ Bits oldLiveDocs = reader.getLiveDocs();
+ FixedBitSet bits = new FixedBitSet(reader.maxDoc());
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ if (oldLiveDocs != null && oldLiveDocs.get(i) == false) {
+ continue;
+ }
+ if (binaryDocValues.advanceExact(i)
+ && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) {
+ bits.set(i);
+ }
+ }
+ return bits;
+ }
+
+ public static List<IndexRearranger.DocumentSelector> createFromExistingIndex(
+ String field, Directory directory) throws IOException {
+ List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+ try (IndexReader reader = DirectoryReader.open(directory)) {
+ for (LeafReaderContext context : reader.leaves()) {
+ HashSet<String> keySet = new HashSet<>();
+ Bits liveDocs = context.reader().getLiveDocs();
+ BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field);
+ for (int i = 0; i < context.reader().maxDoc(); i++) {
+ if (liveDocs != null && liveDocs.get(i) == false) {
+ continue;
+ }
+ if (binaryDocValues.advanceExact(i)) {
+ keySet.add(binaryDocValues.binaryValue().utf8ToString());
+ } else {
+ throw new AssertionError("Document don't have selected key");
+ }
+ }
+ selectors.add(new BinaryDocValueSelector(field, keySet));
+ }
+ }
+ return selectors;
+ }
+}
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
new file mode 100644
index 0000000..194cd69
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FilterCodecReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.NamedThreadFactory;
+
+/**
+ * Copy and rearrange an index according to document selectors, from an input directory to an
+ * output directory. One {@code addIndexes} call is made per selector, so the length of
+ * {@code documentSelectors} determines how many segments there will be.
+ *
+ * <p>TODO: another possible (faster) approach to do this is to manipulate FlushPolicy and
+ * MergePolicy at indexing time to create small desired segments first and merge them accordingly
+ * for details please see: https://markmail.org/message/lbtdntclpnocmfuf
+ */
+public class IndexRearranger {
+  protected final Directory input, output;
+  protected final IndexWriterConfig config;
+  protected final List<DocumentSelector> documentSelectors;
+
+  /**
+   * Creates a rearranger.
+   *
+   * @param input directory holding the index to read
+   * @param output directory the rearranged index is written to
+   * @param config configuration for the writer on {@code output}; its merge policy is replaced by
+   *     {@link NoMergePolicy} when {@link #execute()} runs
+   * @param documentSelectors one selector per desired output segment
+   */
+  public IndexRearranger(
+      Directory input,
+      Directory output,
+      IndexWriterConfig config,
+      List<DocumentSelector> documentSelectors) {
+    this.input = input;
+    this.output = output;
+    this.config = config;
+    this.documentSelectors = documentSelectors;
+  }
+
+  /**
+   * Writes the rearranged index: for every selector, the documents it selects from all input
+   * segments are added to the output through one {@code addIndexes} call. The calls are submitted
+   * to a thread pool and this method waits for all of them.
+   *
+   * @throws Exception if opening the index fails or any of the submitted tasks fails
+   */
+  public void execute() throws Exception {
+    // Do not merge, since one addIndexes call creates one segment.
+    config.setMergePolicy(NoMergePolicy.INSTANCE);
+    try (IndexWriter writer = new IndexWriter(output, config);
+        IndexReader reader = DirectoryReader.open(input)) {
+      // A fixed thread pool rejects a size of 0, which an empty selector list would produce.
+      int threadCount =
+          Math.max(
+              1, Math.min(Runtime.getRuntime().availableProcessors(), documentSelectors.size()));
+      ExecutorService executor =
+          Executors.newFixedThreadPool(threadCount, new NamedThreadFactory("rearranger"));
+      try {
+        List<Future<Void>> futures = new ArrayList<>();
+        for (DocumentSelector selector : documentSelectors) {
+          Callable<Void> addSegment =
+              () -> {
+                addOneSegment(writer, reader, selector);
+                return null;
+              };
+          futures.add(executor.submit(addSegment));
+        }
+        for (Future<Void> future : futures) {
+          future.get();
+        }
+      } finally {
+        // Always release the pool threads, including when a task failed.
+        executor.shutdown();
+      }
+    }
+  }
+
+  /**
+   * Wraps every leaf of {@code reader} so that only documents accepted by {@code selector} are
+   * live, then adds all wrapped leaves to {@code writer} in a single {@code addIndexes} call.
+   */
+  private static void addOneSegment(
+      IndexWriter writer, IndexReader reader, DocumentSelector selector) throws IOException {
+    List<LeafReaderContext> leaves = reader.leaves();
+    CodecReader[] readers = new CodecReader[leaves.size()];
+    for (LeafReaderContext context : leaves) {
+      readers[context.ord] =
+          new DocSelectorFilteredCodecReader((CodecReader) context.reader(), selector);
+    }
+    writer.addIndexes(readers);
+  }
+
+  /** A view of a {@link CodecReader} whose live docs are the ones chosen by a selector. */
+  private static class DocSelectorFilteredCodecReader extends FilterCodecReader {
+
+    private final BitSet filteredLiveDocs;
+    private final int numDocs;
+
+    public DocSelectorFilteredCodecReader(CodecReader in, DocumentSelector selector)
+        throws IOException {
+      super(in);
+      filteredLiveDocs = selector.getFilteredLiveDocs(in);
+      // The number of live documents is the number of selected bits.
+      numDocs = filteredLiveDocs.cardinality();
+    }
+
+    @Override
+    public int numDocs() {
+      return numDocs;
+    }
+
+    @Override
+    public Bits getLiveDocs() {
+      return filteredLiveDocs;
+    }
+
+    @Override
+    public CacheHelper getCoreCacheHelper() {
+      return in.getCoreCacheHelper();
+    }
+
+    @Override
+    public CacheHelper getReaderCacheHelper() {
+      // NOTE(review): presumably null because the live docs differ from the wrapped reader's,
+      // so reader-level caching must not be shared - confirm.
+      return null;
+    }
+  }
+
+  /**
+   * Select document within a CodecReader. Public so that selectors can be implemented outside
+   * this package and passed to the public constructor.
+   */
+  public interface DocumentSelector {
+    /**
+     * Returns the documents of {@code reader} that should be kept; the result is used as the
+     * live docs of the filtered reader.
+     */
+    BitSet getFilteredLiveDocs(CodecReader reader) throws IOException;
+  }
+}
diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
new file mode 100644
index 0000000..fa111a6
--- /dev/null
+++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests for {@link IndexRearranger} and {@link BinaryDocValueSelector}. */
+public class TestIndexRearranger extends LuceneTestCase {
+
+  /** Splits a 10-segment, 100-document index into one odd-ord and one even-ord segment. */
+  public void testRearrange() throws Exception {
+    Directory inputDir = newDirectory();
+    createIndex(100, 10, inputDir);
+
+    Directory outputDir = newDirectory();
+    // Arrays.asList instead of List.of so that the test also compiles on Java 8.
+    List<IndexRearranger.DocumentSelector> selectors =
+        Arrays.asList(new OddDocSelector(), new EvenDocSelector());
+    IndexRearranger rearranger =
+        new IndexRearranger(inputDir, outputDir, getIndexWriterConfig(), selectors);
+    rearranger.execute();
+    try (IndexReader reader = DirectoryReader.open(outputDir)) {
+      assertEquals(2, reader.leaves().size());
+      boolean odd = assertUniformParity(reader.leaves().get(0).reader(), 50);
+      boolean odd2 = assertUniformParity(reader.leaves().get(1).reader(), 50);
+      // The two segments must hold opposite parities; which one comes first is not checked.
+      assertTrue(odd != odd2);
+    }
+    inputDir.close();
+    outputDir.close();
+  }
+
+  /**
+   * Rearranges a 4-segment index so that it gets the same 10-segment layout as a second index
+   * holding the same documents.
+   */
+  public void testRearrangeUsingBinaryDocValueSelector() throws Exception {
+    Directory srcDir = newDirectory();
+    createIndex(100, 10, srcDir);
+    assertSequentialIndex(srcDir, 100, 10);
+
+    Directory inputDir = newDirectory();
+    createIndex(100, 4, inputDir);
+    assertSequentialIndex(inputDir, 100, 4);
+
+    Directory outputDir = newDirectory();
+    IndexRearranger rearranger =
+        new IndexRearranger(
+            inputDir,
+            outputDir,
+            getIndexWriterConfig(),
+            BinaryDocValueSelector.createFromExistingIndex("textOrd", srcDir));
+    rearranger.execute();
+    assertSequentialIndex(outputDir, 100, 10);
+
+    outputDir.close();
+    inputDir.close();
+    srcDir.close();
+  }
+
+  /**
+   * Asserts that {@code segment} has {@code expectedDocs} documents whose "ord" values all share
+   * the same parity.
+   *
+   * @return true if the segment's ords are odd, false if they are even
+   */
+  private static boolean assertUniformParity(LeafReader segment, int expectedDocs)
+      throws IOException {
+    assertEquals(expectedDocs, segment.numDocs());
+    NumericDocValues numericDocValues = segment.getNumericDocValues("ord");
+    assertTrue(numericDocValues.advanceExact(0));
+    boolean odd = numericDocValues.longValue() % 2 == 1;
+    for (int i = 1; i < expectedDocs; i++) {
+      assertTrue(numericDocValues.advanceExact(i));
+      assertEquals(odd, numericDocValues.longValue() % 2 == 1);
+    }
+    return odd;
+  }
+
+  /**
+   * Asserts that the index in {@code dir} has {@code segNum} segments, that "ord" increases by
+   * one from document to document inside each segment, and that {@code docNum} distinct ords are
+   * seen overall.
+   */
+  private static void assertSequentialIndex(Directory dir, int docNum, int segNum)
+      throws IOException {
+    HashSet<Long> seenOrds = new HashSet<>();
+    try (IndexReader reader = DirectoryReader.open(dir)) {
+      assertEquals(segNum, reader.leaves().size());
+      for (int i = 0; i < segNum; i++) {
+        LeafReader leafReader = reader.leaves().get(i).reader();
+        NumericDocValues numericDocValues = leafReader.getNumericDocValues("ord");
+
+        assertTrue(numericDocValues.advanceExact(0));
+        long lastOrd = numericDocValues.longValue();
+        seenOrds.add(lastOrd);
+
+        for (int doc = 1; doc < leafReader.numDocs(); doc++) {
+          assertTrue(numericDocValues.advanceExact(doc));
+          assertEquals(lastOrd + 1, numericDocValues.longValue());
+          lastOrd = numericDocValues.longValue();
+          seenOrds.add(lastOrd);
+        }
+      }
+      assertEquals(docNum, seenOrds.size());
+    }
+  }
+
+  /** Writer configuration shared by all indexes in this test: no merging, sorted by "ord". */
+  private static IndexWriterConfig getIndexWriterConfig() {
+    return new IndexWriterConfig(null)
+        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
+        .setMergePolicy(NoMergePolicy.INSTANCE)
+        .setIndexSort(new Sort(new SortField("ord", SortField.Type.INT)));
+  }
+
+  /**
+   * Indexes {@code docNum} documents with ords 0..docNum-1 into {@code dir}, committing after
+   * every {@code ceil(docNum / segNum)} documents, and asserts that {@code segNum} segments
+   * result.
+   */
+  private static void createIndex(int docNum, int segNum, Directory dir) throws IOException {
+    try (IndexWriter w = new IndexWriter(dir, getIndexWriterConfig())) {
+      int docPerSeg = (int) Math.ceil((double) docNum / segNum);
+      for (int i = 0; i < docNum; i++) {
+        Document doc = new Document();
+        doc.add(new NumericDocValuesField("ord", i));
+        doc.add(new BinaryDocValuesField("textOrd", new BytesRef(Integer.toString(i))));
+        w.addDocument(doc);
+        if (i % docPerSeg == docPerSeg - 1) {
+          w.commit();
+        }
+      }
+      try (IndexReader reader = DirectoryReader.open(w)) {
+        assertEquals(segNum, reader.leaves().size());
+      }
+    }
+  }
+
+  /** Marks every live document of {@code reader} whose "ord" modulo 2 equals {@code remainder}. */
+  private static BitSet selectByParity(CodecReader reader, long remainder) throws IOException {
+    int maxDoc = reader.maxDoc();
+    FixedBitSet filteredSet = new FixedBitSet(maxDoc);
+    Bits liveDocs = reader.getLiveDocs();
+    NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
+    for (int i = 0; i < maxDoc; i++) {
+      if (liveDocs != null && liveDocs.get(i) == false) {
+        continue; // skip deleted documents
+      }
+      if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == remainder) {
+        filteredSet.set(i);
+      }
+    }
+    return filteredSet;
+  }
+
+  /** Selects live documents with an odd "ord". */
+  private static class OddDocSelector implements IndexRearranger.DocumentSelector {
+
+    @Override
+    public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+      return selectByParity(reader, 1);
+    }
+  }
+
+  /** Selects live documents with an even "ord". */
+  private static class EvenDocSelector implements IndexRearranger.DocumentSelector {
+
+    @Override
+    public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+      return selectByParity(reader, 0);
+    }
+  }
+}