You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/01/29 20:26:55 UTC
[lucene-solr] 01/03: LUCENE-9694: New tool for creating a deterministic index (#2246)

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 3da86cb5b368c7325ac89be709b17f9c65eeafdf
Author: Patrick Zhai <zh...@users.noreply.github.com>
AuthorDate: Fri Jan 29 10:32:24 2021 -0800

    LUCENE-9694: New tool for creating a deterministic index (#2246)
---
 lucene/CHANGES.txt                                 |   8 +
 .../lucene/misc/index/BinaryDocValueSelector.java  |  86 ++++++++++
 .../apache/lucene/misc/index/IndexRearranger.java  | 136 +++++++++++++++
 .../lucene/misc/index/TestIndexRearranger.java     | 191 +++++++++++++++++++++
 4 files changed, 421 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index abc588d..9d8eed9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -29,6 +29,14 @@ Other
 ---------------------
 (No changes)
 
+======================= Lucene 8.9.0 =======================
+
+New Features
+---------------------
+
+* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
+  on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai)
+
 ======================= Lucene 8.8.0 =======================
 
 API Changes
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
new file mode 100644
index 0000000..082675f
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BinaryDocValueSelector.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+
+/** Select documents using binary doc values */
+public class BinaryDocValueSelector implements IndexRearranger.DocumentSelector, Serializable {
+
+  private final String field;
+  private final HashSet<String> keySet;
+
+  public BinaryDocValueSelector(String field, HashSet<String> keySet) {
+    this.field = field;
+    this.keySet = keySet;
+  }
+
+  @Override
+  public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+    BinaryDocValues binaryDocValues = reader.getBinaryDocValues(field);
+    Bits oldLiveDocs = reader.getLiveDocs();
+    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      if (oldLiveDocs != null && oldLiveDocs.get(i) == false) {
+        continue;
+      }
+      if (binaryDocValues.advanceExact(i)
+          && keySet.contains(binaryDocValues.binaryValue().utf8ToString())) {
+        bits.set(i);
+      }
+    }
+    return bits;
+  }
+
+  public static List<IndexRearranger.DocumentSelector> createFromExistingIndex(
+      String field, Directory directory) throws IOException {
+    List<IndexRearranger.DocumentSelector> selectors = new ArrayList<>();
+    try (IndexReader reader = DirectoryReader.open(directory)) {
+      for (LeafReaderContext context : reader.leaves()) {
+        HashSet<String> keySet = new HashSet<>();
+        Bits liveDocs = context.reader().getLiveDocs();
+        BinaryDocValues binaryDocValues = context.reader().getBinaryDocValues(field);
+        for (int i = 0; i < context.reader().maxDoc(); i++) {
+          if (liveDocs != null && liveDocs.get(i) == false) {
+            continue;
+          }
+          if (binaryDocValues.advanceExact(i)) {
+            keySet.add(binaryDocValues.binaryValue().utf8ToString());
+          } else {
+            throw new AssertionError("Document don't have selected key");
+          }
+        }
+        selectors.add(new BinaryDocValueSelector(field, keySet));
+      }
+    }
+    return selectors;
+  }
+}
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
new file mode 100644
index 0000000..194cd69
--- /dev/null
+++ b/lucene/misc/src/java/org/apache/lucene/misc/index/IndexRearranger.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FilterCodecReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.NamedThreadFactory;
+
+/**
+ * Copy and rearrange index according to document selectors, from input dir to output dir. Length of
+ * documentSelectors determines how many segments there will be
+ *
+ * <p>TODO: another possible (faster) approach to do this is to manipulate FlushPolicy and
+ * MergePolicy at indexing time to create small desired segments first and merge them accordingly
+ * for details please see: https://markmail.org/message/lbtdntclpnocmfuf
+ */
+public class IndexRearranger {
+  protected final Directory input, output;
+  protected final IndexWriterConfig config;
+  protected final List<DocumentSelector> documentSelectors;
+
+  public IndexRearranger(
+      Directory input,
+      Directory output,
+      IndexWriterConfig config,
+      List<DocumentSelector> documentSelectors) {
+    this.input = input;
+    this.output = output;
+    this.config = config;
+    this.documentSelectors = documentSelectors;
+  }
+
+  public void execute() throws Exception {
+    config.setMergePolicy(
+        NoMergePolicy.INSTANCE); // do not merge since one addIndexes call create one segment
+    try (IndexWriter writer = new IndexWriter(output, config);
+        IndexReader reader = DirectoryReader.open(input)) {
+      ExecutorService executor =
+          Executors.newFixedThreadPool(
+              Math.min(Runtime.getRuntime().availableProcessors(), documentSelectors.size()),
+              new NamedThreadFactory("rearranger"));
+      ArrayList<Future<Void>> futures = new ArrayList<>();
+      for (DocumentSelector record : documentSelectors) {
+        Callable<Void> addSegment =
+            () -> {
+              addOneSegment(writer, reader, record);
+              return null;
+            };
+        futures.add(executor.submit(addSegment));
+      }
+      for (Future<Void> future : futures) {
+        future.get();
+      }
+      executor.shutdown();
+    }
+  }
+
+  private static void addOneSegment(
+      IndexWriter writer, IndexReader reader, DocumentSelector selector) throws IOException {
+    CodecReader[] readers = new CodecReader[reader.leaves().size()];
+    for (LeafReaderContext context : reader.leaves()) {
+      readers[context.ord] =
+          new DocSelectorFilteredCodecReader((CodecReader) context.reader(), selector);
+    }
+    writer.addIndexes(readers);
+  }
+
+  private static class DocSelectorFilteredCodecReader extends FilterCodecReader {
+
+    BitSet filteredLiveDocs;
+    int numDocs;
+
+    public DocSelectorFilteredCodecReader(CodecReader in, DocumentSelector selector)
+        throws IOException {
+      super(in);
+      filteredLiveDocs = selector.getFilteredLiveDocs(in);
+      numDocs = filteredLiveDocs.cardinality();
+    }
+
+    @Override
+    public int numDocs() {
+      return numDocs;
+    }
+
+    @Override
+    public Bits getLiveDocs() {
+      return filteredLiveDocs;
+    }
+
+    @Override
+    public CacheHelper getCoreCacheHelper() {
+      return in.getCoreCacheHelper();
+    }
+
+    @Override
+    public CacheHelper getReaderCacheHelper() {
+      return null;
+    }
+  }
+
+  /** Select document within a CodecReader */
+  interface DocumentSelector {
+    BitSet getFilteredLiveDocs(CodecReader reader) throws IOException;
+  }
+}
diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
new file mode 100644
index 0000000..fa111a6
--- /dev/null
+++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestIndexRearranger.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc.index;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestIndexRearranger extends LuceneTestCase {
+  public void testRearrange() throws Exception {
+    Directory inputDir = newDirectory();
+    createIndex(100, 10, inputDir);
+
+    Directory outputDir = newDirectory();
+    IndexRearranger rearranger =
+        new IndexRearranger(
+            inputDir,
+            outputDir,
+            getIndexWriterConfig(),
+            List.of(new OddDocSelector(), new EvenDocSelector()));
+    rearranger.execute();
+    IndexReader reader = DirectoryReader.open(outputDir);
+    assertEquals(2, reader.leaves().size());
+    LeafReader segment1 = reader.leaves().get(0).reader();
+    assertEquals(50, segment1.numDocs());
+    NumericDocValues numericDocValues = segment1.getNumericDocValues("ord");
+    assertTrue(numericDocValues.advanceExact(0));
+    boolean odd = numericDocValues.longValue() % 2 == 1;
+    for (int i = 1; i < 50; i++) {
+      assertTrue(numericDocValues.advanceExact(i));
+      assertEquals(odd, numericDocValues.longValue() % 2 == 1);
+    }
+    LeafReader segment2 = reader.leaves().get(1).reader();
+    assertEquals(50, segment2.numDocs());
+    numericDocValues = segment2.getNumericDocValues("ord");
+    assertTrue(numericDocValues.advanceExact(0));
+    boolean odd2 = numericDocValues.longValue() % 2 == 1;
+    assertTrue(odd != odd2);
+    for (int i = 1; i < 50; i++) {
+      assertTrue(numericDocValues.advanceExact(i));
+      assertEquals(odd2, numericDocValues.longValue() % 2 == 1);
+    }
+    reader.close();
+    inputDir.close();
+    outputDir.close();
+  }
+
+  public void testRearrangeUsingBinaryDocValueSelector() throws Exception {
+    Directory srcDir = newDirectory();
+    createIndex(100, 10, srcDir);
+    assertSequentialIndex(srcDir, 100, 10);
+
+    Directory inputDir = newDirectory();
+    createIndex(100, 4, inputDir);
+    assertSequentialIndex(inputDir, 100, 4);
+
+    Directory outputDir = newDirectory();
+    IndexRearranger rearranger =
+        new IndexRearranger(
+            inputDir,
+            outputDir,
+            getIndexWriterConfig(),
+            BinaryDocValueSelector.createFromExistingIndex("textOrd", srcDir));
+    rearranger.execute();
+    assertSequentialIndex(outputDir, 100, 10);
+
+    outputDir.close();
+    inputDir.close();
+    srcDir.close();
+  }
+
+  private static void assertSequentialIndex(Directory dir, int docNum, int segNum)
+      throws IOException {
+    HashSet<Long> seenOrds = new HashSet<>();
+    IndexReader reader = DirectoryReader.open(dir);
+    for (int i = 0; i < segNum; i++) {
+      LeafReader leafReader = reader.leaves().get(i).reader();
+      NumericDocValues numericDocValues = leafReader.getNumericDocValues("ord");
+
+      assertTrue(numericDocValues.advanceExact(0));
+      long lastOrd = numericDocValues.longValue();
+      seenOrds.add(lastOrd);
+
+      for (int doc = 1; doc < leafReader.numDocs(); doc++) {
+        assertTrue(numericDocValues.advanceExact(doc));
+        assertEquals(numericDocValues.longValue(), lastOrd + 1);
+        lastOrd = numericDocValues.longValue();
+        seenOrds.add(lastOrd);
+      }
+    }
+    assertEquals(docNum, seenOrds.size());
+    reader.close();
+  }
+
+  private static IndexWriterConfig getIndexWriterConfig() {
+    return new IndexWriterConfig(null)
+        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
+        .setMergePolicy(NoMergePolicy.INSTANCE)
+        .setIndexSort(new Sort(new SortField("ord", SortField.Type.INT)));
+  }
+
+  private static void createIndex(int docNum, int segNum, Directory dir) throws IOException {
+    IndexWriter w = new IndexWriter(dir, getIndexWriterConfig());
+    int docPerSeg = (int) Math.ceil((double) docNum / segNum);
+    for (int i = 0; i < docNum; i++) {
+      Document doc = new Document();
+      doc.add(new NumericDocValuesField("ord", i));
+      doc.add(new BinaryDocValuesField("textOrd", new BytesRef(Integer.toString(i))));
+      w.addDocument(doc);
+      if (i % docPerSeg == docPerSeg - 1) {
+        w.commit();
+      }
+    }
+    IndexReader reader = DirectoryReader.open(w);
+    assertEquals(segNum, reader.leaves().size());
+    reader.close();
+    w.close();
+  }
+
+  private class OddDocSelector implements IndexRearranger.DocumentSelector {
+
+    @Override
+    public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+      FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
+      Bits liveDocs = reader.getLiveDocs();
+      NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
+      for (int i = 0; i < reader.maxDoc(); i++) {
+        if (liveDocs != null && liveDocs.get(i) == false) {
+          continue;
+        }
+        if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 1) {
+          filteredSet.set(i);
+        }
+      }
+      return filteredSet;
+    }
+  }
+
+  private class EvenDocSelector implements IndexRearranger.DocumentSelector {
+
+    @Override
+    public BitSet getFilteredLiveDocs(CodecReader reader) throws IOException {
+      FixedBitSet filteredSet = new FixedBitSet(reader.maxDoc());
+      Bits liveDocs = reader.getLiveDocs();
+      NumericDocValues numericDocValues = reader.getNumericDocValues("ord");
+      for (int i = 0; i < reader.maxDoc(); i++) {
+        if (liveDocs != null && liveDocs.get(i) == false) {
+          continue;
+        }
+        if (numericDocValues.advanceExact(i) && numericDocValues.longValue() % 2 == 0) {
+          filteredSet.set(i);
+        }
+      }
+      return filteredSet;
+    }
+  }
+}