You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/04/25 23:54:49 UTC
svn commit: r1330577 - in
/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene:
codecs/SegmentInfosFormat.java
codecs/lucene40/Lucene40SegmentInfosFormat.java index/SegmentInfos.java
store/DataOutput.java
Author: rmuir
Date: Wed Apr 25 21:54:49 2012
New Revision: 1330577
URL: http://svn.apache.org/viewvc?rev=1330577&view=rev
Log:
LUCENE-2946: doc 4.0 segments file format
Modified:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/SegmentInfosFormat.java Wed Apr 25 21:54:49 2012
@@ -17,11 +17,16 @@ package org.apache.lucene.codecs;
* limitations under the License.
*/
+import org.apache.lucene.index.SegmentInfos; // javadocs
+
/**
- * Expert: Controls the format of the segments file.
- * Note, this isn't a per-segment file, if you change the format, other versions
- * of lucene won't be able to read it, yackedy schmackedy
+ * Expert: Controls the format of the
+ * {@link SegmentInfos} (segments file).
+ * <p>
+ * NOTE: This isn't a per-segment file. If you change the format, other versions
+ * of Lucene won't be able to read it.
*
+ * @see SegmentInfos
* @lucene.experimental
*/
// TODO: would be great to handle this situation better.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java Wed Apr 25 21:54:49 2012
@@ -1,8 +1,16 @@
package org.apache.lucene.codecs.lucene40;
+import org.apache.lucene.codecs.Codec; // javadocs
+import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
import org.apache.lucene.codecs.SegmentInfosFormat;
import org.apache.lucene.codecs.SegmentInfosReader;
import org.apache.lucene.codecs.SegmentInfosWriter;
+import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
+import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
+import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.index.SegmentInfos; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,6 +30,90 @@ import org.apache.lucene.codecs.SegmentI
*/
/**
+ * Lucene 4.0 Segments format.
+ * <p>
+ * Files:
+ * <ul>
+ * <li><tt>segments.gen</tt>: described in {@link SegmentInfos}
+ * <li><tt>segments_N</tt>: Format, Codec, Version, NameCounter, SegCount,
+ * <SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment,
+ * DocStoreIsCompoundFile], NumField, NormGen<sup>NumField</sup>,
+ * IsCompoundFile, DeletionCount, HasProx, SegCodec, Diagnostics,
+ * HasVectors><sup>SegCount</sup>, CommitUserData, Checksum
+ * </ul>
+ * </p>
+ * Data types:
+ * <p>
+ * <ul>
+ * <li>Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset,
+ * DeletionCount --> {@link DataOutput#writeInt Int32}</li>
+ * <li>Version, DelGen, NormGen, Checksum -->
+ * {@link DataOutput#writeLong Int64}</li>
+ * <li>SegVersion, SegName, DocStoreSegment, Codec, SegCodec -->
+ * {@link DataOutput#writeString String}</li>
+ * <li>Diagnostics, CommitUserData -->
+ * {@link DataOutput#writeStringStringMap Map<String,String>}</li>
+ * <li>IsCompoundFile, DocStoreIsCompoundFile, HasProx,
+ * HasVectors --> {@link DataOutput#writeByte Int8}</li>
+ * </ul>
+ * </p>
+ * Field Descriptions:
+ * <p>
+ * <ul>
+ * <li>Format is {@link SegmentInfos#FORMAT_4_0}.</li>
+ * <li>Codec is "Lucene40", it's the {@link Codec} that wrote this particular segments file.</li>
+ * <li>Version counts how often the index has been changed by adding or deleting
+ * documents.</li>
+ * <li>NameCounter is used to generate names for new segment files.</li>
+ * <li>SegVersion is the code version that created the segment.</li>
+ * <li>SegName is the name of the segment, and is used as the file name prefix for
+ * all of the files that compose the segment's index.</li>
+ * <li>SegSize is the number of documents contained in the segment index.</li>
+ * <li>DelGen is the generation count of the deletes file. If this is -1,
+ * there are no deletes. Anything above zero means there are deletes
+ * stored by {@link LiveDocsFormat}.</li>
+ * <li>NumField is the size of the array for NormGen, or -1 if there are no
+ * NormGens stored.</li>
+ * <li>NormGen records the generation of the separate norms files. If NumField is
+ * -1, there are no NormGens stored and all are assumed to be -1. The generation
+ * then has the same meaning as delGen (above).</li>
+ * <li>IsCompoundFile records whether the segment is written as a compound file or
+ * not. If this is -1, the segment is not a compound file. If it is 1, the segment
+ * is a compound file. Else it is 0, which means we check the filesystem to see if
+ * _X.cfs exists.</li>
+ * <li>DocStoreOffset, DocStoreSegment, DocStoreIsCompoundFile: If DocStoreOffset
+ * is -1, this segment has its own doc store (stored fields values and term
+ * vectors) files and DocStoreSegment and DocStoreIsCompoundFile are not stored.
+ * In this case all files for {@link StoredFieldsFormat stored field values} and
+ * {@link TermVectorsFormat term vectors} will be stored with this segment.
+ * Otherwise, DocStoreSegment is the name of the segment that has the shared doc
+ * store files; DocStoreIsCompoundFile is 1 if that segment is stored in compound
+ * file format (as a <tt>.cfx</tt> file); and DocStoreOffset is the starting document
+ * in the shared doc store files where this segment's documents begin. In this case,
+ * this segment does not store its own doc store files but instead shares a single
+ * set of these files with other segments.</li>
+ * <li>Checksum contains the CRC32 checksum of all bytes in the segments_N file up
+ * until the checksum. This is used to verify integrity of the file on opening the
+ * index.</li>
+ * <li>DeletionCount records the number of deleted documents in this segment.</li>
+ * <li>HasProx is 1 if any fields in this segment have position data
+ * ({@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS DOCS_AND_FREQS_AND_POSITIONS} or
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS});
+ * else, it's 0.</li>
+ * <li>SegCodec is the {@link Codec#getName() name} of the Codec that encoded
+ * this segment.</li>
+ * <li>CommitUserData stores an optional user-supplied opaque
+ * Map<String,String> that was passed to {@link IndexWriter#commit(java.util.Map)}
+ * or {@link IndexWriter#prepareCommit(java.util.Map)}.</li>
+ * <li>The Diagnostics Map is privately written by IndexWriter, as a debugging aid,
+ * for each segment it creates. It includes metadata like the current Lucene
+ * version, OS, Java version, why the segment was created (merge, flush,
+ * addIndexes), etc.</li>
+ * <li>HasVectors is 1 if this segment stores term vectors, else it's 0.</li>
+ * </ul>
+ * </p>
+ *
+ * @see SegmentInfos
* @lucene.experimental
*/
public class Lucene40SegmentInfosFormat extends SegmentInfosFormat {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/SegmentInfos.java Wed Apr 25 21:54:49 2012
@@ -38,6 +38,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.NoSuchDirectoryException;
@@ -47,6 +48,25 @@ import org.apache.lucene.util.ThreadInte
/**
* A collection of segmentInfo objects with methods for operating on
* those segments in relation to the file system.
+ * <p>
+ * The active segments in the index are stored in the segment info file,
+ * <tt>segments_N</tt>. There may be one or more <tt>segments_N</tt> files in the
+ * index; however, the one with the largest generation is the active one (when
+ * older segments_N files are present it's because they temporarily cannot be
+ * deleted, or, a writer is in the process of committing, or a custom
+ * {@link org.apache.lucene.index.IndexDeletionPolicy IndexDeletionPolicy}
+ * is in use). This file lists each segment by name, has details about the
+ * separate norms and deletion files, and also contains the size of each
+ * segment.
+ * </p>
+ * <p>There is also a file <tt>segments.gen</tt>. This file contains
+ * the current generation (the <tt>_N</tt> in <tt>segments_N</tt>) of the index.
+ * This is used only as a fallback in case the current generation cannot be
+ * accurately determined by directory listing alone (as is the case for some NFS
+ * clients with time-based directory cache expiration). This file simply contains
+ * an {@link DataOutput#writeInt Int32} version header
+ * ({@link #FORMAT_SEGMENTS_GEN_CURRENT}), followed by the
+ * generation recorded as {@link DataOutput#writeLong Int64}, written twice.</p>
*
* @lucene.experimental
*/
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java?rev=1330577&r1=1330576&r2=1330577&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/store/DataOutput.java Wed Apr 25 21:54:49 2012
@@ -30,6 +30,11 @@ import org.apache.lucene.util.UnicodeUti
public abstract class DataOutput {
/** Writes a single byte.
+ * <p>
+ * The most primitive data type is an eight-bit byte. Files are
+ * accessed as sequences of bytes. All other data types are defined
+ * as sequences of bytes, so file formats are byte-order independent.
+ *
* @see IndexInput#readByte()
*/
public abstract void writeByte(byte b) throws IOException;
@@ -52,6 +57,9 @@ public abstract class DataOutput {
public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
/** Writes an int as four bytes.
+ * <p>
+ * 32-bit signed integer written as four bytes, high-order bytes first.
+ *
* @see DataInput#readInt()
*/
public void writeInt(int i) throws IOException {
@@ -184,6 +192,9 @@ public abstract class DataOutput {
}
/** Writes a long as eight bytes.
+ * <p>
+ * 64-bit signed integer written as eight bytes, high-order bytes first.
+ *
* @see DataInput#readLong()
*/
public void writeLong(long i) throws IOException {
@@ -208,6 +219,10 @@ public abstract class DataOutput {
}
/** Writes a string.
+ * <p>
+ * Writes strings as UTF-8 encoded bytes. First the length, in bytes, is
+ * written as a {@link #writeVInt VInt}, followed by the bytes.
+ *
* @see DataInput#readString()
*/
public void writeString(String s) throws IOException {
@@ -238,6 +253,15 @@ public abstract class DataOutput {
}
}
+ /**
+ * Writes a String map.
+ * <p>
+ * First the size is written as an {@link #writeInt(int) Int32},
+ * followed by each key-value pair written as two consecutive
+ * {@link #writeString(String) String}s.
+ *
+ * @param map Input map. May be null (equivalent to an empty map)
+ */
public void writeStringStringMap(Map<String,String> map) throws IOException {
if (map == null) {
writeInt(0);