You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/06/11 17:55:45 UTC

svn commit: r1348919 - in /lucene/dev/branches/branch_4x/lucene: core/src/java/org/apache/lucene/codecs/lucene40/ core/src/java/org/apache/lucene/codecs/pulsing/ core/src/test/org/apache/lucene/codecs/lucene40/ test-framework/src/java/org/apache/lucene...

Author: rmuir
Date: Mon Jun 11 15:55:44 2012
New Revision: 1348919

URL: http://svn.apache.org/viewvc?rev=1348919&view=rev
Log:
LUCENE-4129: add codecheader to .frq/.prx

Added:
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestAllFilesHaveCodecHeader.java   (with props)
Modified:
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/nestedpulsing/NestedPulsingPostingsFormat.java

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java?rev=1348919&r1=1348918&r2=1348919&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java Mon Jun 11 15:55:44 2012
@@ -159,7 +159,8 @@ import org.apache.lucene.util.fst.FST; /
  * with the frequency of the term in that document (except when frequencies are
  * omitted: {@link IndexOptions#DOCS_ONLY}).</p>
  * <ul>
- *   <li>FreqFile (.frq) --&gt; &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ *   <li>FreqFile (.frq) --&gt; Header, &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  *   <li>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></li>
  *   <li>TermFreq --&gt; DocDelta[, Freq?]</li>
  *   <li>SkipData --&gt; &lt;&lt;SkipLevelLength, SkipLevel&gt;
@@ -232,7 +233,8 @@ import org.apache.lucene.util.fst.FST; /
  * anything into this file, and if all fields in the index omit positional data
  * then the .prx file will not exist.</p>
  * <ul>
- *   <li>ProxFile (.prx) --&gt; &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ *   <li>ProxFile (.prx) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  *   <li>TermPositions --&gt; &lt;Positions&gt; <sup>DocFreq</sup></li>
  *   <li>Positions --&gt; &lt;PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?&gt; <sup>Freq</sup></li>
  *   <li>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --&gt; {@link DataOutput#writeVInt VInt}</li>

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java?rev=1348919&r1=1348918&r2=1348919&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsReader.java Mon Jun 11 15:55:44 2012
@@ -37,6 +37,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;
 
 /** 
  * Concrete class that reads the 4.0 frq/prox
@@ -58,29 +59,35 @@ public class Lucene40PostingsReader exte
   // private String segment;
 
   public Lucene40PostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
-    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
+    boolean success = false;
+    IndexInput freqIn = null;
+    IndexInput proxIn = null;
+    try {
+      freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
                            ioContext);
-    // TODO: hasProx should (somehow!) become codec private,
-    // but it's tricky because 1) FIS.hasProx is global (it
-    // could be all fields that have prox are written by a
-    // different codec), 2) the field may have had prox in
-    // the past but all docs w/ that field were deleted.
-    // Really we'd need to init prxOut lazily on write, and
-    // then somewhere record that we actually wrote it so we
-    // know whether to open on read:
-    if (fieldInfos.hasProx()) {
-      boolean success = false;
-      try {
+      CodecUtil.checkHeader(freqIn, Lucene40PostingsWriter.FRQ_CODEC, Lucene40PostingsWriter.VERSION_START,Lucene40PostingsWriter.VERSION_START);
+      // TODO: hasProx should (somehow!) become codec private,
+      // but it's tricky because 1) FIS.hasProx is global (it
+      // could be all fields that have prox are written by a
+      // different codec), 2) the field may have had prox in
+      // the past but all docs w/ that field were deleted.
+      // Really we'd need to init prxOut lazily on write, and
+      // then somewhere record that we actually wrote it so we
+      // know whether to open on read:
+      if (fieldInfos.hasProx()) {
         proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
-                               ioContext);
-        success = true;
-      } finally {
-        if (!success) {
-          freqIn.close();
-        }
+                             ioContext);
+        CodecUtil.checkHeader(proxIn, Lucene40PostingsWriter.PRX_CODEC, Lucene40PostingsWriter.VERSION_START,Lucene40PostingsWriter.VERSION_START);
+      } else {
+        proxIn = null;
+      }
+      this.freqIn = freqIn;
+      this.proxIn = proxIn;
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(freqIn, proxIn);
       }
-    } else {
-      proxIn = null;
     }
   }
 
@@ -88,7 +95,7 @@ public class Lucene40PostingsReader exte
   public void init(IndexInput termsIn) throws IOException {
 
     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.CODEC,
+    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.TERMS_CODEC,
       Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
 
     skipInterval = termsIn.readInt();

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java?rev=1348919&r1=1348918&r2=1348919&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java Mon Jun 11 15:55:44 2012
@@ -45,7 +45,9 @@ import org.apache.lucene.util.IOUtils;
  * @lucene.experimental 
  */
 public final class Lucene40PostingsWriter extends PostingsWriterBase {
-  final static String CODEC = "Lucene40PostingsWriter";
+  final static String TERMS_CODEC = "Lucene40PostingsWriterTerms";
+  final static String FRQ_CODEC = "Lucene40PostingsWriterFrq";
+  final static String PRX_CODEC = "Lucene40PostingsWriterPrx";
 
   //private static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
   
@@ -102,7 +104,9 @@ public final class Lucene40PostingsWrite
     String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
     freqOut = state.directory.createOutput(fileName, state.context);
     boolean success = false;
+    IndexOutput proxOut = null;
     try {
+      CodecUtil.writeHeader(freqOut, FRQ_CODEC, VERSION_CURRENT);
       // TODO: this is a best effort, if one of these fields has no postings
       // then we make an empty prx file, same as if we are wrapped in 
       // per-field postingsformat. maybe... we shouldn't
@@ -112,14 +116,16 @@ public final class Lucene40PostingsWrite
         // prox file
         fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
         proxOut = state.directory.createOutput(fileName, state.context);
+        CodecUtil.writeHeader(proxOut, PRX_CODEC, VERSION_CURRENT);
       } else {
         // Every field omits TF so we will write no prox file
         proxOut = null;
       }
+      this.proxOut = proxOut;
       success = true;
     } finally {
       if (!success) {
-        IOUtils.closeWhileHandlingException(freqOut);
+        IOUtils.closeWhileHandlingException(freqOut, proxOut);
       }
     }
 
@@ -135,7 +141,7 @@ public final class Lucene40PostingsWrite
   @Override
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
-    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
     termsOut.writeInt(skipInterval);                // write skipInterval
     termsOut.writeInt(maxSkipLevels);               // write maxSkipLevels
     termsOut.writeInt(skipMinimum);                 // write skipMinimum

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java?rev=1348919&r1=1348918&r2=1348919&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/pulsing/PulsingPostingsFormat.java Mon Jun 11 15:55:44 2012
@@ -29,6 +29,7 @@ import org.apache.lucene.codecs.Postings
 import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
 
 /** This postings format "inlines" the postings for terms that have
  *  low docFreq.  It wraps another postings format, which is used for
@@ -65,33 +66,39 @@ public abstract class PulsingPostingsFor
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+    PostingsWriterBase docsWriter = null;
 
     // Terms that have <= freqCutoff number of docs are
     // "pulsed" (inlined):
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
+    PostingsWriterBase pulsingWriter = null;
 
     // Terms dict
     boolean success = false;
     try {
+      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+
+      // Terms that have <= freqCutoff number of docs are
+      // "pulsed" (inlined):
+      pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
       }
     }
   }
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-
-    PostingsReaderBase docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(docsReader);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReader = null;
 
     boolean success = false;
     try {
+      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+      pulsingReader = new PulsingPostingsReader(docsReader);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -102,7 +109,7 @@ public abstract class PulsingPostingsFor
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
       }
     }
   }

Added: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestAllFilesHaveCodecHeader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestAllFilesHaveCodecHeader.java?rev=1348919&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestAllFilesHaveCodecHeader.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestAllFilesHaveCodecHeader.java Mon Jun 11 15:55:44 2012
@@ -0,0 +1,98 @@
+package org.apache.lucene.codecs.lucene40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.CompoundFileDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Test that a plain Lucene40Codec puts codec headers in all files.
+ */
+public class TestAllFilesHaveCodecHeader extends LuceneTestCase {
+  public void test() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    conf.setCodec(Codec.forName("Lucene40"));
+    // riw should sometimes create docvalues fields, etc
+    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    // these fields should sometimes get term vectors, etc
+    Field idField = newStringField("id", "", Field.Store.NO);
+    Field bodyField = newTextField("body", "", Field.Store.NO);
+    doc.add(idField);
+    doc.add(bodyField);
+    for (int i = 0; i < 100; i++) {
+      idField.setStringValue(Integer.toString(i));
+      bodyField.setStringValue(_TestUtil.randomUnicodeString(random()));
+      riw.addDocument(doc);
+      if (random().nextInt(7) == 0) {
+        riw.commit();
+      }
+    }
+    riw.close();
+    checkHeaders(dir);
+    dir.close();
+  }
+  
+  private void checkHeaders(Directory dir) throws IOException {
+    for (String file : dir.listAll()) {
+      if (file.equals(IndexFileNames.SEGMENTS_GEN)) {
+        continue; // segments.gen has no header, thats ok
+      }
+      if (file.endsWith(IndexFileNames.COMPOUND_FILE_EXTENSION)) {
+        /* TODO: enable this after resolving LUCENE-4130
+         * CompoundFileDirectory cfsDir = new CompoundFileDirectory(dir, file, newIOContext(random()), false);
+         * checkHeaders(cfsDir); // recurse into cfs
+         * cfsDir.close();
+         */
+        continue; // .cfs has its own header... would be nice to fix
+      }
+      if (file.endsWith(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)) {
+        continue; // .cfe has its own header... would be nice to fix
+      }
+      IndexInput in = null;
+      boolean success = false;
+      try {
+        in = dir.openInput(file, newIOContext(random()));
+        int val = in.readInt();
+        assertEquals(file + " has no codec header, instead found: " + val, CodecUtil.CODEC_MAGIC, val);
+        success = true;
+      } finally {
+        if (success) {
+          IOUtils.close(in);
+        } else {
+          IOUtils.closeWhileHandlingException(in);
+        }
+      }
+    }
+  }
+}

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/nestedpulsing/NestedPulsingPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/nestedpulsing/NestedPulsingPostingsFormat.java?rev=1348919&r1=1348918&r2=1348919&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/nestedpulsing/NestedPulsingPostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/nestedpulsing/NestedPulsingPostingsFormat.java Mon Jun 11 15:55:44 2012
@@ -32,6 +32,7 @@ import org.apache.lucene.codecs.pulsing.
 import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
 
 /**
  * Pulsing(1, Pulsing(2, Lucene40))
@@ -47,32 +48,38 @@ public class NestedPulsingPostingsFormat
   
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = new Lucene40PostingsWriter(state);
-
-    PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriterInner = null;
+    PostingsWriterBase pulsingWriter = null;
     
     // Terms dict
     boolean success = false;
     try {
+      docsWriter = new Lucene40PostingsWriter(state);
+
+      pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+      pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, 
           BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriterInner, pulsingWriter);
       }
     }
   }
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
-    PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReaderInner = null;
+    PostingsReaderBase pulsingReader = null;
     boolean success = false;
     try {
+      docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+      pulsingReaderInner = new PulsingPostingsReader(docsReader);
+      pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -83,7 +90,7 @@ public class NestedPulsingPostingsFormat
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReaderInner, pulsingReader);
       }
     }
   }