Posted to commits@lucene.apache.org by jp...@apache.org on 2022/01/05 15:28:05 UTC
[lucene] branch main updated: LUCENE-10291: Only read/write postings when there is at least one indexed field (#539)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 8fa7412 LUCENE-10291: Only read/write postings when there is at least one indexed field (#539)
8fa7412 is described below
commit 8fa7412dec458e42f379cc856bd6ffebe8c6f8e9
Author: Yannick Welsch <ya...@welsch.lu>
AuthorDate: Wed Jan 5 16:28:00 2022 +0100
LUCENE-10291: Only read/write postings when there is at least one indexed field (#539)
---
.../org/apache/lucene/codecs/FieldsConsumer.java | 8 +-
.../lucene/codecs/perfield/PerFieldMergeState.java | 14 +-
.../codecs/perfield/PerFieldPostingsFormat.java | 2 +
.../java/org/apache/lucene/index/CheckIndex.java | 19 ++-
.../java/org/apache/lucene/index/CodecReader.java | 10 +-
.../java/org/apache/lucene/index/FieldInfos.java | 9 ++
.../apache/lucene/index/FreqProxTermsWriter.java | 5 +
.../java/org/apache/lucene/index/MergeState.java | 6 +-
.../org/apache/lucene/index/SegmentCommitInfo.java | 4 +-
.../apache/lucene/index/SegmentCoreReaders.java | 12 +-
.../org/apache/lucene/index/SegmentMerger.java | 6 +-
.../services/org.apache.lucene.codecs.Codec | 17 +++
.../org/apache/lucene/codecs/TestMinimalCodec.java | 162 +++++++++++++++++++++
13 files changed, 257 insertions(+), 17 deletions(-)
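For readers skimming the diff: after this change, a segment whose fields are all unindexed is written without any postings files, so CodecReader.getPostingsReader() and the per-reader fieldsProducers entries in MergeState may now be null, and every consumer below gains a null check. A minimal sketch of that calling pattern (PostingsAccessSketch and termsOrNull are hypothetical names; the Lucene types and methods are the ones touched in the diff):

import java.io.IOException;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.Terms;

final class PostingsAccessSketch {
  // Mirrors the null-check idiom added to CodecReader and CheckIndex in this patch.
  static Terms termsOrNull(CodecReader reader, String field) throws IOException {
    FieldsProducer postings = reader.getPostingsReader();
    if (postings == null) {
      return null; // segment was written with no indexed fields, hence no postings
    }
    return postings.terms(field);
  }
}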
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
index 0659827..cf0f796 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/FieldsConsumer.java
@@ -79,9 +79,11 @@ public abstract class FieldsConsumer implements Closeable {
final FieldsProducer f = mergeState.fieldsProducers[readerIndex];
final int maxDoc = mergeState.maxDocs[readerIndex];
- f.checkIntegrity();
- slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
- fields.add(f);
+ if (f != null) {
+ f.checkIntegrity();
+ slices.add(new ReaderSlice(docBase, maxDoc, readerIndex));
+ fields.add(f);
+ }
docBase += maxDoc;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
index 47fe0df..c340448 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldMergeState.java
@@ -79,7 +79,10 @@ final class PerFieldMergeState {
in.fieldInfos[i] = new FilterFieldInfos(orgFieldInfos[i], fields);
}
for (int i = 0; i < orgFieldsProducers.length; i++) {
- in.fieldsProducers[i] = new FilterFieldsProducer(orgFieldsProducers[i], fields);
+ in.fieldsProducers[i] =
+ orgFieldsProducers[i] == null
+ ? null
+ : new FilterFieldsProducer(orgFieldsProducers[i], fields);
}
return in;
}
@@ -103,6 +106,7 @@ final class PerFieldMergeState {
// Copy of the private fields from FieldInfos
// Renamed so as to be less confusing about which fields we're referring to
private final boolean filteredHasVectors;
+ private final boolean filteredHasPostings;
private final boolean filteredHasProx;
private final boolean filteredHasPayloads;
private final boolean filteredHasOffsets;
@@ -116,6 +120,7 @@ final class PerFieldMergeState {
super(toArray(src));
boolean hasVectors = false;
+ boolean hasPostings = false;
boolean hasProx = false;
boolean hasPayloads = false;
boolean hasOffsets = false;
@@ -130,6 +135,7 @@ final class PerFieldMergeState {
if (this.filteredNames.contains(fi.name)) {
this.filtered.add(fi);
hasVectors |= fi.hasVectors();
+ hasPostings |= fi.getIndexOptions() != IndexOptions.NONE;
hasProx |= fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= fi.getIndexOptions() != IndexOptions.DOCS;
hasOffsets |=
@@ -143,6 +149,7 @@ final class PerFieldMergeState {
}
this.filteredHasVectors = hasVectors;
+ this.filteredHasPostings = hasPostings;
this.filteredHasProx = hasProx;
this.filteredHasPayloads = hasPayloads;
this.filteredHasOffsets = hasOffsets;
@@ -172,6 +179,11 @@ final class PerFieldMergeState {
}
@Override
+ public boolean hasPostings() {
+ return filteredHasPostings;
+ }
+
+ @Override
public boolean hasProx() {
return filteredHasProx;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
index d183f62..6468897 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
@@ -27,6 +27,7 @@ import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.TreeMap;
@@ -185,6 +186,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
new MergedIterator<>(
true,
Arrays.stream(mergeState.fieldsProducers)
+ .filter(Objects::nonNull)
.map(FieldsProducer::iterator)
.toArray(Iterator[]::new));
Map<PostingsFormat, FieldsGroup> formatToGroups = buildFieldsGroupMapping(indexedFieldNames);
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index de5141a..f699937 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -44,6 +44,7 @@ import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PostingsFormat;
@@ -2407,7 +2408,12 @@ public final class CheckIndex implements Closeable {
infoStream.print(" test: terms, freq, prox...");
}
- final Fields fields = reader.getPostingsReader().getMergeInstance();
+ FieldsProducer fields = reader.getPostingsReader();
+ if (fields != null) {
+ fields = fields.getMergeInstance();
+ } else {
+ return new Status.TermIndexStatus();
+ }
final FieldInfos fieldInfos = reader.getFieldInfos();
NormsProducer normsProducer = reader.getNormsReader();
if (normsProducer != null) {
@@ -3540,10 +3546,13 @@ public final class CheckIndex implements Closeable {
final Bits liveDocs = reader.getLiveDocs();
- final Fields postingsFields;
+ FieldsProducer postingsFields;
// TODO: testTermsIndex
if (doSlowChecks) {
- postingsFields = reader.getPostingsReader().getMergeInstance();
+ postingsFields = reader.getPostingsReader();
+ if (postingsFields != null) {
+ postingsFields = postingsFields.getMergeInstance();
+ }
} else {
postingsFields = null;
}
@@ -3597,6 +3606,10 @@ public final class CheckIndex implements Closeable {
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();
+ if (postingsFields == null) {
+ throw new CheckIndexException(
+ "vector field=" + field + " does not exist in postings; doc=" + j);
+ }
Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {
throw new CheckIndexException(
diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
index f78f126..064cdbf 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
@@ -106,10 +106,14 @@ public abstract class CodecReader extends LeafReader {
@Override
public final Terms terms(String field) throws IOException {
// ensureOpen(); no; getPostingsReader calls this
+ FieldsProducer fieldsProducer = getPostingsReader();
+ if (fieldsProducer == null) {
+ return null;
+ }
// We could check the FieldInfo IndexOptions but there's no point since
// PostingsReader will simply return null for fields that don't exist or that have no terms
// index.
- return getPostingsReader().terms(field);
+ return fieldsProducer.terms(field);
}
// returns the FieldInfo that corresponds to the given field and type, or
@@ -241,7 +245,9 @@ public abstract class CodecReader extends LeafReader {
ensureOpen();
// terms/postings
- getPostingsReader().checkIntegrity();
+ if (getPostingsReader() != null) {
+ getPostingsReader().checkIntegrity();
+ }
// norms
if (getNormsReader() != null) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index 44fb021..76cf9bb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -48,6 +48,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
public static final FieldInfos EMPTY = new FieldInfos(new FieldInfo[0]);
private final boolean hasFreq;
+ private final boolean hasPostings;
private final boolean hasProx;
private final boolean hasPayloads;
private final boolean hasOffsets;
@@ -67,6 +68,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
/** Constructs a new FieldInfos from an array of FieldInfo objects */
public FieldInfos(FieldInfo[] infos) {
boolean hasVectors = false;
+ boolean hasPostings = false;
boolean hasProx = false;
boolean hasPayloads = false;
boolean hasOffsets = false;
@@ -112,6 +114,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
}
hasVectors |= info.hasVectors();
+ hasPostings |= info.getIndexOptions() != IndexOptions.NONE;
hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
hasOffsets |=
@@ -132,6 +135,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
}
this.hasVectors = hasVectors;
+ this.hasPostings = hasPostings;
this.hasProx = hasProx;
this.hasPayloads = hasPayloads;
this.hasOffsets = hasOffsets;
@@ -200,6 +204,11 @@ public class FieldInfos implements Iterable<FieldInfo> {
return hasFreq;
}
+ /** Returns true if any fields have postings */
+ public boolean hasPostings() {
+ return hasPostings;
+ }
+
/** Returns true if any fields have positions */
public boolean hasProx() {
return hasProx;
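The new FieldInfos#hasPostings() accessor is what gates postings writes and reads elsewhere in this patch (FreqProxTermsWriter, SegmentCoreReaders, SegmentMerger). Conceptually it is true as soon as any field is indexed. A short sketch restating that derivation (HasPostingsSketch is a hypothetical name; FieldInfos is iterable over FieldInfo, as the class above already is):

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexOptions;

final class HasPostingsSketch {
  // Re-derives what FieldInfos#hasPostings() reports: any field whose IndexOptions is not NONE.
  static boolean hasPostings(FieldInfos infos) {
    for (FieldInfo fi : infos) {
      if (fi.getIndexOptions() != IndexOptions.NONE) {
        return true;
      }
    }
    return false;
  }
}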
diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
index e417549..ca2775c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
@@ -99,6 +99,11 @@ final class FreqProxTermsWriter extends TermsHash {
}
}
+ if (!state.fieldInfos.hasPostings()) {
+ assert allFields.isEmpty();
+ return;
+ }
+
// Sort by field name
CollectionUtil.introSort(allFields);
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 6bc7e76..faea6cb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -132,7 +132,11 @@ public class MergeState {
termVectorsReaders[i] = termVectorsReaders[i].getMergeInstance();
}
- fieldsProducers[i] = reader.getPostingsReader().getMergeInstance();
+ fieldsProducers[i] = reader.getPostingsReader();
+ if (fieldsProducers[i] != null) {
+ fieldsProducers[i] = fieldsProducers[i].getMergeInstance();
+ }
+
pointsReaders[i] = reader.getPointsReader();
if (pointsReaders[i] != null) {
pointsReaders[i] = pointsReaders[i].getMergeInstance();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
index b8bb1d7..9357cf1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCommitInfo.java
@@ -244,7 +244,9 @@ public class SegmentCommitInfo {
// updates) and then maybe even be able to remove LiveDocsFormat.files().
// Must separately add any live docs files:
- info.getCodec().liveDocsFormat().files(this, files);
+ if (hasDeletions()) {
+ info.getCodec().liveDocsFormat().files(this, files);
+ }
// must separately add any field updates files
for (Set<String> updatefiles : dvUpdatesFiles.values()) {
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
index 68de0f2..df733b1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
@@ -112,10 +112,14 @@ final class SegmentCoreReaders {
final SegmentReadState segmentReadState =
new SegmentReadState(cfsDir, si.info, coreFieldInfos, context);
- final PostingsFormat format = codec.postingsFormat();
- // Ask codec for its Fields
- fields = format.fieldsProducer(segmentReadState);
- assert fields != null;
+ if (coreFieldInfos.hasPostings()) {
+ final PostingsFormat format = codec.postingsFormat();
+ // Ask codec for its Fields
+ fields = format.fieldsProducer(segmentReadState);
+ assert fields != null;
+ } else {
+ fields = null;
+ }
// ask codec for its Norms:
// TODO: since we don't write any norms file if there are no norms,
// kinda jaky to assume the codec handles the case of no norms file at all gracefully?!
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
index 689b772..805b5e5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -203,8 +203,10 @@ final class SegmentMerger {
// Use the merge instance in order to reuse the same IndexInput for all terms
normsMergeInstance = norms.getMergeInstance();
}
- try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
- consumer.merge(mergeState, normsMergeInstance);
+ if (mergeState.mergeFieldInfos.hasPostings()) {
+ try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
+ consumer.merge(mergeState, normsMergeInstance);
+ }
}
}
}
diff --git a/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec
new file mode 100644
index 0000000..8c7c0df
--- /dev/null
+++ b/lucene/core/src/test/META-INF/services/org.apache.lucene.codecs.Codec
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCodec
+org.apache.lucene.codecs.TestMinimalCodec$MinimalCompoundCodec
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java b/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java
new file mode 100644
index 0000000..c3c4edb
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/TestMinimalCodec.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
+
+import java.io.IOException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.tests.analysis.MockAnalyzer;
+import org.apache.lucene.tests.store.BaseDirectoryWrapper;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+
+/**
+ * Tests to ensure that {@link Codec}s won't need to implement all formats in cases where only a
+ * small subset of Lucene's functionality is used.
+ */
+public class TestMinimalCodec extends LuceneTestCase {
+
+ public void testMinimalCodec() throws IOException {
+ runMinimalCodecTest(false);
+ }
+
+ public void testMinimalCompoundCodec() throws IOException {
+ runMinimalCodecTest(true);
+ }
+
+ private void runMinimalCodecTest(boolean useCompoundFile) throws IOException {
+ try (BaseDirectoryWrapper dir = newDirectory()) {
+ IndexWriterConfig writerConfig =
+ newIndexWriterConfig(new MockAnalyzer(random()))
+ .setCodec(useCompoundFile ? new MinimalCompoundCodec() : new MinimalCodec())
+ .setUseCompoundFile(useCompoundFile);
+ if (!useCompoundFile) {
+ writerConfig.getMergePolicy().setNoCFSRatio(0.0);
+ writerConfig.getMergePolicy().setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
+ }
+
+ try (IndexWriter writer = new IndexWriter(dir, writerConfig)) {
+ writer.addDocument(basicDocument());
+ writer.flush();
+ // create second segment
+ writer.addDocument(basicDocument());
+ writer.forceMerge(1); // test merges
+ if (randomBoolean()) {
+ writer.commit();
+ }
+
+ try (DirectoryReader reader = DirectoryReader.open(writer)) {
+ assertEquals(2, reader.numDocs());
+ }
+ }
+ }
+ }
+
+ /** returns a basic document with no indexed fields */
+ private static Document basicDocument() {
+ return new Document();
+ }
+
+ /** Minimal codec implementation for working with the most basic documents */
+ public static class MinimalCodec extends Codec {
+
+ protected final Codec wrappedCodec = TestUtil.getDefaultCodec();
+
+ public MinimalCodec() {
+ this("MinimalCodec");
+ }
+
+ protected MinimalCodec(String name) {
+ super(name);
+ }
+
+ @Override
+ public FieldInfosFormat fieldInfosFormat() {
+ return wrappedCodec.fieldInfosFormat();
+ }
+
+ @Override
+ public SegmentInfoFormat segmentInfoFormat() {
+ return wrappedCodec.segmentInfoFormat();
+ }
+
+ @Override
+ public CompoundFormat compoundFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public LiveDocsFormat liveDocsFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public StoredFieldsFormat storedFieldsFormat() {
+ // TODO: avoid calling this when no stored fields are written or read
+ return wrappedCodec.storedFieldsFormat();
+ }
+
+ @Override
+ public PostingsFormat postingsFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public DocValuesFormat docValuesFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public TermVectorsFormat termVectorsFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public NormsFormat normsFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public PointsFormat pointsFormat() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public KnnVectorsFormat knnVectorsFormat() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ /**
+ * Minimal codec implementation for working with the most basic documents, supporting compound
+ * formats
+ */
+ public static class MinimalCompoundCodec extends MinimalCodec {
+ public MinimalCompoundCodec() {
+ super("MinimalCompoundCodec");
+ }
+
+ @Override
+ public CompoundFormat compoundFormat() {
+ return wrappedCodec.compoundFormat();
+ }
+ }
+}
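For completeness, the scenario this commit targets can also be reproduced with the default codec: index documents that carry only unindexed data (for example a StoredField) and no postings files are written or read for the resulting segments. A minimal sketch under those assumptions (class and field names are made up; the core APIs used here exist in current Lucene):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class NoIndexedFieldsSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      Document doc = new Document();
      doc.add(new StoredField("payload", "stored only, never indexed"));
      writer.addDocument(doc); // no indexed field, so FreqProxTermsWriter skips postings entirely
      writer.commit();
      try (DirectoryReader reader = DirectoryReader.open(writer)) {
        System.out.println("numDocs=" + reader.numDocs()); // 1, served without a postings reader
      }
    }
  }
}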