You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/04/25 23:54:49 UTC
svn commit: r1330577 - in /lucene/dev/trunk/lucene/core/src/java/org/apache/lucene: codecs/SegmentInfosFormat.java codecs/lucene40/Lucene40SegmentInfosFormat.java index/SegmentInfos.java store/DataOutput.java

Author: rmuir
Date: Wed Apr 25 21:54:49 2012
New Revision: 1330577

URL: http://svn.apache.org/viewvc?rev=1330577&view=rev
Log:
LUCENE-2946: doc 4.0 segments file format

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java Wed Apr 25 21:54:49 2012
@@ -17,11 +17,16 @@ package org.apache.lucene.codecs;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.SegmentInfos; // javadocs
+
 /**
- * Expert: Controls the format of the segments file.
- * Note, this isn't a per-segment file, if you change the format, other versions
- * of lucene won't be able to read it, yackedy schmackedy
+ * Expert: Controls the format of the 
+ * {@link SegmentInfos} (segments file).
+ * <p>
+ * NOTE: This isn't a per-segment file. If you change the format, other versions
+ * of Lucene won't be able to read it.
  * 
+ * @see SegmentInfos
  * @lucene.experimental
  */
 // TODO: would be great to handle this situation better.

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java Wed Apr 25 21:54:49 2012
@@ -1,8 +1,16 @@
 package org.apache.lucene.codecs.lucene40;
 
+import org.apache.lucene.codecs.Codec; // javadocs
+import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
 import org.apache.lucene.codecs.SegmentInfosFormat;
 import org.apache.lucene.codecs.SegmentInfosReader;
 import org.apache.lucene.codecs.SegmentInfosWriter;
+import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
+import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
+import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.index.SegmentInfos; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,6 +30,90 @@ import org.apache.lucene.codecs.SegmentI
  */
 
 /**
+ * Lucene 4.0 Segments format.
+ * <p>
+ * Files:
+ * <ul>
+ *   <li><tt>segments.gen</tt>: described in {@link SegmentInfos}
+ *   <li><tt>segments_N</tt>: Format, Codec, Version, NameCounter, SegCount,
+ *    &lt;SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment,
+ *    DocStoreIsCompoundFile], NumField, NormGen<sup>NumField</sup>, 
+ *    IsCompoundFile, DeletionCount, HasProx, SegCodec, Diagnostics,
+ *    HasVectors&gt;<sup>SegCount</sup>, CommitUserData, Checksum
+ * </ul>
+ * </p>
+ * Data types:
+ * <p>
+ * <ul>
+ *   <li>Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset,
+ *       DeletionCount --&gt; {@link DataOutput#writeInt Int32}</li>
+ *   <li>Version, DelGen, NormGen, Checksum --&gt; 
+ *       {@link DataOutput#writeLong Int64}</li>
+ *   <li>SegVersion, SegName, DocStoreSegment, Codec, SegCodec --&gt; 
+ *       {@link DataOutput#writeString String}</li>
+ *   <li>Diagnostics, CommitUserData --&gt; 
+ *       {@link DataOutput#writeStringStringMap Map&lt;String,String&gt;}</li>
+ *   <li>IsCompoundFile, DocStoreIsCompoundFile, HasProx,
+ *       HasVectors --&gt; {@link DataOutput#writeByte Int8}</li>
+ * </ul>
+ * </p>
+ * Field Descriptions:
+ * <p>
+ * <ul>
+ *   <li>Format is {@link SegmentInfos#FORMAT_4_0}.</li>
+ *   <li>Codec is "Lucene40", it's the {@link Codec} that wrote this particular segments file.</li>
+ *   <li>Version counts how often the index has been changed by adding or deleting
+ *       documents.</li>
+ *   <li>NameCounter is used to generate names for new segment files.</li>
+ *   <li>SegVersion is the code version that created the segment.</li>
+ *   <li>SegName is the name of the segment, and is used as the file name prefix for
+ *       all of the files that compose the segment's index.</li>
+ *   <li>SegSize is the number of documents contained in the segment index.</li>
+ *   <li>DelGen is the generation count of the deletes file. If this is -1,
+ *       there are no deletes. Anything above zero means there are deletes 
+ *       stored by {@link LiveDocsFormat}.</li>
+ *   <li>NumField is the size of the array for NormGen, or -1 if there are no
+ *       NormGens stored.</li>
+ *   <li>NormGen records the generation of the separate norms files. If NumField is
+ *       -1, there are no normGens stored and all are assumed to be -1. The generation
+ *       then has the same meaning as delGen (above).</li>
+ *   <li>IsCompoundFile records whether the segment is written as a compound file or
+ *       not. If this is -1, the segment is not a compound file. If it is 1, the segment
+ *       is a compound file. Else it is 0, which means we check the filesystem to see if
+ *       _X.cfs exists.</li>
+ *   <li>DocStoreOffset, DocStoreSegment, DocStoreIsCompoundFile: If DocStoreOffset
+ *       is -1, this segment has its own doc store (stored fields values and term
+ *       vectors) files and DocStoreSegment and DocStoreIsCompoundFile are not stored.
+ *       In this case all files for {@link StoredFieldsFormat stored field values} and
+ *       {@link TermVectorsFormat term vectors} will be stored with this segment. 
+ *       Otherwise, DocStoreSegment is the name of the segment that has the shared doc 
+ *       store files; DocStoreIsCompoundFile is 1 if that segment is stored in compound 
+ *       file format (as a <tt>.cfx</tt> file); and DocStoreOffset is the starting document 
+ *       in the shared doc store files where this segment's documents begin. In this case, 
+ *       this segment does not store its own doc store files but instead shares a single 
+ *       set of these files with other segments.</li>
+ *   <li>Checksum contains the CRC32 checksum of all bytes in the segments_N file up
+ *       until the checksum. This is used to verify integrity of the file on opening the
+ *       index.</li>
+ *   <li>DeletionCount records the number of deleted documents in this segment.</li>
+ *   <li>HasProx is 1 if any fields in this segment have position data
+ *       ({@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS DOCS_AND_FREQS_AND_POSITIONS} or 
+ *       {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}); 
+ *       else, it's 0.</li>
+ *   <li>SegCodec is the {@link Codec#getName() name} of the Codec that encoded
+ *       this segment.</li>
+ *   <li>CommitUserData stores an optional user-supplied opaque
+ *       Map&lt;String,String&gt; that was passed to {@link IndexWriter#commit(java.util.Map)} 
+ *       or {@link IndexWriter#prepareCommit(java.util.Map)}.</li>
+ *   <li>The Diagnostics Map is privately written by IndexWriter, as a debugging aid,
+ *       for each segment it creates. It includes metadata like the current Lucene
+ *       version, OS, Java version, why the segment was created (merge, flush,
+ *       addIndexes), etc.</li>
+ *   <li>HasVectors is 1 if this segment stores term vectors, else it's 0.</li>
+ * </ul>
+ * </p>
+ * 
+ * @see SegmentInfos
  * @lucene.experimental
  */
 public class Lucene40SegmentInfosFormat extends SegmentInfosFormat {

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java Wed Apr 25 21:54:49 2012
@@ -38,6 +38,7 @@ import org.apache.lucene.index.FieldInfo
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.DataOutput; // javadocs
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.NoSuchDirectoryException;
@@ -47,6 +48,25 @@ import org.apache.lucene.util.ThreadInte
 /**
  * A collection of segmentInfo objects with methods for operating on
  * those segments in relation to the file system.
+ * <p>
+ * The active segments in the index are stored in the segment info file,
+ * <tt>segments_N</tt>. There may be one or more <tt>segments_N</tt> files in the
+ * index; however, the one with the largest generation is the active one (when
+ * older segments_N files are present it's because they temporarily cannot be
+ * deleted, or a writer is in the process of committing, or a custom
+ * {@link org.apache.lucene.index.IndexDeletionPolicy IndexDeletionPolicy}
+ * is in use). This file lists each segment by name, has details about the
+ * separate norms and deletion files, and also contains the size of each
+ * segment.
+ * </p>
+ * <p>There is also a file <tt>segments.gen</tt>. This file contains
+ * the current generation (the <tt>_N</tt> in <tt>segments_N</tt>) of the index.
+ * This is used only as a fallback in case the current generation cannot be
+ * accurately determined by directory listing alone (as is the case for some NFS
+ * clients with time-based directory cache expiration). This file simply contains
+ * an {@link DataOutput#writeInt Int32} version header 
+ * ({@link #FORMAT_SEGMENTS_GEN_CURRENT}), followed by the
+ * generation recorded as {@link DataOutput#writeLong Int64}, written twice.</p>
  * 
  * @lucene.experimental
  */

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java Wed Apr 25 21:54:49 2012
@@ -30,6 +30,11 @@ import org.apache.lucene.util.UnicodeUti
 public abstract class DataOutput {
 
   /** Writes a single byte.
+   * <p>
+   * The most primitive data type is an eight-bit byte. Files are 
+   * accessed as sequences of bytes. All other data types are defined 
+   * as sequences of bytes, so file formats are byte-order independent.
+   * 
    * @see IndexInput#readByte()
    */
   public abstract void writeByte(byte b) throws IOException;
@@ -52,6 +57,9 @@ public abstract class DataOutput {
   public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
 
   /** Writes an int as four bytes.
+   * <p>
+   * 32-bit signed integer written as four bytes, high-order bytes first.
+   * 
    * @see DataInput#readInt()
    */
   public void writeInt(int i) throws IOException {
@@ -184,6 +192,9 @@ public abstract class DataOutput {
   }
 
   /** Writes a long as eight bytes.
+   * <p>
+   * 64-bit signed integer written as eight bytes, high-order bytes first.
+   * 
    * @see DataInput#readLong()
    */
   public void writeLong(long i) throws IOException {
@@ -208,6 +219,10 @@ public abstract class DataOutput {
   }
 
   /** Writes a string.
+   * <p>
+   * Writes strings as UTF-8 encoded bytes. First the length, in bytes, is
+   * written as a {@link #writeVInt VInt}, followed by the bytes.
+   * 
    * @see DataInput#readString()
    */
   public void writeString(String s) throws IOException {
@@ -238,6 +253,15 @@ public abstract class DataOutput {
     }
   }
 
+  /**
+   * Writes a String map.
+   * <p>
+   * First the size is written as an {@link #writeInt(int) Int32},
+   * followed by each key-value pair written as two consecutive 
+   * {@link #writeString(String) String}s.
+   * 
+   * @param map Input map. May be null (equivalent to an empty map)
+   */
   public void writeStringStringMap(Map<String,String> map) throws IOException {
     if (map == null) {
       writeInt(0);