You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/04/27 17:12:38 UTC
svn commit: r1331464 - in
/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene:
codecs/lucene40/Lucene40LiveDocsFormat.java util/CodecUtil.java
Author: rmuir
Date: Fri Apr 27 15:12:37 2012
New Revision: 1331464
URL: http://svn.apache.org/viewvc?rev=1331464&view=rev
Log:
LUCENE-2946: doc 4.0 livedocs format
Modified:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java?rev=1331464&r1=1331463&r2=1331464&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40LiveDocsFormat.java Fri Apr 27 15:12:37 2012
@@ -23,11 +23,45 @@ import java.util.Set;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CodecUtil; // javadocs
import org.apache.lucene.util.MutableBits;
+/**
+ * Lucene 4.0 Live Documents Format.
+ *
+ * <p>The .del file is optional, and only exists when a segment contains
+ * deletions.</p>
+ * <p>Although per-segment, this file is maintained exterior to compound segment
+ * files.</p>
+ * <p>Deletions (.del) --> Format,Header,ByteCount,BitCount, Bits | DGaps (depending
+ * on Format)</p>
+ * <ul>
+ * <li>Format,ByteCount,BitCount --> {@link DataOutput#writeInt Uint32}</li>
+ * <li>Bits --> <{@link DataOutput#writeByte Byte}> <sup>ByteCount</sup></li>
+ * <li>DGaps --> <DGap,NonOnesByte> <sup>NonOnesBytesCount</sup></li>
+ * <li>DGap --> {@link DataOutput#writeVInt VInt}</li>
+ * <li>NonOnesByte --> {@link DataOutput#writeByte Byte}</li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * </ul>
+ * <p>Format is 1: indicates cleared DGaps.</p>
+ * <p>ByteCount indicates the number of bytes in Bits. It is typically
+ * (SegSize/8)+1.</p>
+ * <p>BitCount indicates the number of bits that are currently set in Bits.</p>
+ * <p>Bits contains one bit for each document indexed. When the bit corresponding
+ * to a document number is cleared, that document is marked as deleted. Bit ordering
+ * is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
+ * 0x02, then document 9 is marked as alive (not deleted).</p>
+ * <p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
+ * of DGaps on indexes of nonOnes bytes in Bits, and the nonOnes bytes themselves.
+ * The number of nonOnes bytes in Bits (NonOnesBytesCount) is not stored.</p>
+ * <p>For example, if there are 8000 bits and only bits 10,12,32 are cleared, DGaps
+ * would be used:</p>
+ * <p>(VInt) 1 , (byte) 0xEB , (VInt) 3 , (byte) 0xFE</p>
+ */
public class Lucene40LiveDocsFormat extends LiveDocsFormat {
/** Extension of deletes */
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java?rev=1331464&r1=1331463&r2=1331464&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/CodecUtil.java Fri Apr 27 15:12:37 2012
@@ -28,16 +28,47 @@ import org.apache.lucene.store.DataOutpu
/**
* Utility class for reading and writing versioned headers.
- * This is useful to ensure that a file is in the format
- * you think it is.
+ * <p>
+ * Writing codec headers is useful to ensure that a file is in
+ * the format you think it is.
+ *
* @lucene.experimental
*/
public final class CodecUtil {
private CodecUtil() {} // no instance
- private final static int CODEC_MAGIC = 0x3fd76c17;
-
+ /**
+ * Constant to identify the start of a codec header.
+ */
+ public final static int CODEC_MAGIC = 0x3fd76c17;
+
+ /**
+ * Writes a codec header, which records both a string to
+ * identify the file and a version number. This header can
+ * be parsed and validated with
+ * {@link #checkHeader(DataInput, String, int, int) checkHeader()}.
+ * <p>
+ * CodecHeader --> Magic,CodecName,Version
+ * <ul>
+ * <li>Magic --> {@link DataOutput#writeInt Uint32}. This
+ * identifies the start of the header. It is always {@value #CODEC_MAGIC}.
+ * <li>CodecName --> {@link DataOutput#writeString String}. This
+ * is a string to identify this file.
+ * <li>Version --> {@link DataOutput#writeInt Uint32}. Records
+ * the version of the file.
+ * </ul>
+ * <p>
+ * Note that the length of a codec header depends only upon the
+ * name of the codec, so this length can be computed at any time
+ * with {@link #headerLength(String)}.
+ *
+ * @param out Output stream
+ * @param codec String to identify this file. It should be simple ASCII,
+ * less than 128 characters in length.
+ * @param version Version number
+ * @throws IOException If there is an I/O error writing to the underlying medium.
+ */
public static void writeHeader(DataOutput out, String codec, int version)
throws IOException {
BytesRef bytes = new BytesRef(codec);
@@ -49,10 +80,44 @@ public final class CodecUtil {
out.writeInt(version);
}
+ /**
+ * Computes the length of a codec header.
+ *
+ * @param codec Codec name.
+ * @return length of the entire codec header.
+ * @see #writeHeader(DataOutput, String, int)
+ */
public static int headerLength(String codec) {
return 9+codec.length();
}
+ /**
+ * Reads and validates a header previously written with
+ * {@link #writeHeader(DataOutput, String, int)}.
+ * <p>
+ * When reading a file, supply the expected <code>codec</code> and
+ * an expected version range (<code>minVersion to maxVersion</code>).
+ *
+ * @param in Input stream, positioned at the point where the
+ * header was previously written. Typically this is located
+ * at the beginning of the file.
+ * @param codec The expected codec name.
+ * @param minVersion The minimum supported expected version number.
+ * @param maxVersion The maximum supported expected version number.
+ * @return The actual version found, when a valid header is found
+ * that matches <code>codec</code>, with an actual version
+ * where <code>minVersion <= actual <= maxVersion</code>.
+ * Otherwise an exception is thrown.
+ * @throws CorruptIndexException If the first four bytes are not
+ * {@link #CODEC_MAGIC}, or if the actual codec found is
+ * not <code>codec</code>.
+ * @throws IndexFormatTooOldException If the actual version is less
+ * than <code>minVersion</code>.
+ * @throws IndexFormatTooNewException If the actual version is greater
+ * than <code>maxVersion</code>.
+ * @throws IOException If there is an I/O error reading from the underlying medium.
+ * @see #writeHeader(DataOutput, String, int)
+ */
public static int checkHeader(DataInput in, String codec, int minVersion, int maxVersion)
throws IOException {