You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/08/18 15:12:48 UTC
svn commit: r1374573 - in
/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs:
MultiLevelSkipListWriter.java block/BlockPostingsFormat.java
block/BlockPostingsReader.java lucene40/Lucene40PostingsFormat.java
Author: mikemccand
Date: Sat Aug 18 13:12:47 2012
New Revision: 1374573
URL: http://svn.apache.org/viewvc?rev=1374573&view=rev
Log:
LUCENE-3982: improve javadocs
Modified:
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java?rev=1374573&r1=1374572&r2=1374573&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListWriter.java Sat Aug 18 13:12:47 2012
@@ -26,6 +26,8 @@ import org.apache.lucene.util.MathUtil;
/**
* This abstract class writes skip lists with multiple levels.
*
+ * <pre>
+ *
* Example for skipInterval = 3:
* c (skip level 2)
* c c c (skip level 1)
@@ -45,6 +47,7 @@ import org.apache.lucene.util.MathUtil;
*
* While this class takes care of writing the different skip levels,
* subclasses must define the actual format of the skip data.
+ * </pre>
* @lucene.experimental
*/
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java?rev=1374573&r1=1374572&r2=1374573&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java Sat Aug 18 13:12:47 2012
@@ -30,10 +30,290 @@ import org.apache.lucene.index.SegmentRe
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
+// javadocs
+import org.apache.lucene.codecs.MultiLevelSkipListWriter;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.packed.PackedInts;
+
/**
- * Encodes/decode postings in packed int blocks for faster
- * decode.
+ * Block postings format, which encodes postings in packed int blocks
+ * for faster decode.
+ *
+ * <p>
+ * Basic idea:
+ * <ul>
+ * <li>
+ * <b>Packed Block and VInt Block</b>:
+ * <p>In packed block, integers are encoded with the same bit width ({@link PackedInts packed format}),
+ * the block size (i.e. number of integers inside block) is fixed. </p>
+ * <p>In VInt block, integers are encoded as {@link DataOutput#writeVInt VInt},
+ * the block size is variable.</p>
+ * </li>
+ *
+ * <li>
+ * <b>Block structure</b>:
+ * <p>When the postings are long enough, BlockPostingsFormat will try to encode most integer data
+ * as packed blocks.</p>
+ * <p>Take a term with 259 documents as an example: the first 256 document ids are encoded as two packed
+ * blocks, while the remaining 3 are encoded as one VInt block. </p>
+ * <p>Different kinds of data are always encoded separately into different packed blocks, but may
+ * possibly be encoded into the same VInt block. </p>
+ * <p>This strategy is applied to pairs:
+ * <document number, frequency>,
+ * <position, payload length>,
+ * <position, offset start, offset length>, and
+ * <position, payload length, offset start, offset length>.</p>
+ * </li>
+ *
+ * <li>
+ * <b>Skipper setting</b>:
+ * <p>The structure of skip table is quite similar to Lucene40PostingsFormat. Skip interval is the
+ * same as block size, and each skip entry points to the beginning of each block. However, for
+ * the first block, skip data is omitted.</p>
+ * </li>
+ *
+ * <li>
+ * <b>Positions, Payloads, and Offsets</b>:
+ * <p>A position is an integer indicating where the term occurred in one document.
+ * A payload is a blob of metadata associated with current position.
+ * An offset is a pair of integers indicating the tokenized start/end offsets for given term
+ * in current position. </p>
+ * <p>When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets (assuming a
+ * null payload contributes one count). As mentioned in block structure, it is possible to encode
+ * these three either together or separately.</p>
+ * <p>For all the cases, payloads and offsets are stored together. When encoded as packed block,
+ * position data is separated out as .pos, while payloads and offsets are encoded in .pay (payload
+ * metadata will also be stored directly in .pay). When encoded as VInt block, all these three are
+ * stored in .pos (as is payload metadata).</p>
+ * </li>
+ * </ul>
+ * </p>
+ *
+ * <p>
+ * Files and detailed format:
+ * <ul>
+ * <li><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * <li><tt>.tip</tt>: <a href="#Termindex">Term Index</a></li>
+ * <li><tt>.doc</tt>: <a href="#Frequencies">Frequencies and Skip Data</a></li>
+ * <li><tt>.pos</tt>: <a href="#Positions">Positions</a></li>
+ * <li><tt>.pay</tt>: <a href="#Payloads">Payloads and Offsets</a></li>
+ * </ul>
+ * </p>
+ *
+ * <a name="Termdictionary" id="Termdictionary"></a>
+ * <dl>
+ * <dd>
+ * <b>Term Dictionary</b>
+ *
+ * <p>The .tim file format is quite similar to Lucene40PostingsFormat,
+ * with minor difference in MetadataBlock</p>
+ *
+ * <ul>
+ * <!-- TODO: expand on this, its not really correct and doesnt explain sub-blocks etc -->
+ * <li>TermDictionary(.tim) --> Header, DirOffset, PostingsHeader, PackedBlockSize,
+ * <Block><sup>NumBlocks</sup>, FieldSummary</li>
+ * <li>Block --> SuffixBlock, StatsBlock, MetadataBlock</li>
+ * <li>SuffixBlock --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup></li>
+ * <li>StatsBlock --> StatsLength, <DocFreq, TotalTermFreq><sup>EntryCount</sup></li>
+ * <li>MetadataBlock --> MetaLength, <DocFPDelta,
+ * <PosFPDelta, PosBlockFPDelta?, PayFPDelta?>?,
+ * SkipFPDelta? ><sup>EntryCount</sup></li>
+ * <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength,
+ * Byte<sup>RootCodeLength</sup>, SumDocFreq, DocCount>
+ * <sup>NumFields</sup></li>
+ * <li>Header, PostingsHeader --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ * <li>PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength,
+ * PosBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
+ * {@link DataOutput#writeVInt VInt}</li>
+ * <li>TotalTermFreq, DocFPDelta, PosFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq -->
+ * {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Only MetadataBlock is explained here; the other fields are described in
+ * <!--NOTE: change this manual html link, when directory structure is changed. -->
+ * <a href="../../../../../org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.html#Termdictionary">Lucene40PostingsFormat:TermDictionary</a>
+ * </li>
+ * <li>PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width is
+ * determined by the largest integer. Smaller block size results in smaller variance among width
+ * of integers hence smaller indexes. Larger block size results in more efficient bulk i/o hence
+ * better acceleration. This value should always be a multiple of 64, currently fixed as 128 as
+ * a tradeoff. It is also the skip interval used to accelerate {@link DocsEnum#advance(int)}.</li>
+ * <li>DocFPDelta determines the position of this term's TermFreqs within the .doc file.
+ * In particular, it is the difference of file offset between this term's
+ * data and previous term's data (or zero, for the first term in the block).</li>
+ * <li>PayFPDelta determines the position of this term's payload or offset data within the .pay file.
+ * Similar to DocFPDelta, it is the difference between two file positions (or neglected,
+ * for fields that omit payloads and offsets, or for the first term in the block).</li>
+ * <!--TODO: not quite sure, what is the difference?-->
+ * <li>PosFPDelta and PosBlockFPDelta determine the position of this term's TermPositions within
+ * the .pos file.
+ * <li>PosBlockFPDelta determines the position of this term's TermPositions within the .pos file.
+ * Similar to DocFPDelta, it is the difference between two file positions (or neglected,
+ * for fields that omit position data, or for the first term in the block).</li>
+ * <li>SkipFPDelta determines the position of this term's SkipData within the .doc
+ * file. In particular, it is the number of bytes after TermFreqs that the
+ * SkipData starts. In other words, it is the length of the TermFreq data.
+ * SkipFPDelta is only stored if DocFreq is not smaller than SkipMinimum,
+ * (i.e. 8 in BlockPostingsFormat).</li>
+ * </ul>
+ * </dd>
+ * </dl>
+ *
+ * <a name="Termindex" id="Termindex"></a>
+ * <dl>
+ * <dd>
+ * <b>Term Index</b>
+ * <p>The .tip file format is described in</p>
+ * <!--NOTE: change this manual html link, when directory structure is changed. -->
+ * <a href="../../../../../org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.html#Termindex">Lucene40PostingsFormat:TermIndex</a>
+ * </dd>
+ * </dl>
+ *
+ *
+ * <a name="Frequencies" id="Frequencies"></a>
+ * <dl>
+ * <dd>
+ * <b>Frequencies and Skip Data</b>
+ *
+ * <p>The .doc file contains the lists of documents which contain each term, along
+ * with the frequency of the term in that document (except when frequencies are
+ * omitted: {@link IndexOptions#DOCS_ONLY}). It also saves skip data to the beginning of
+ * each packed or VInt block, when the length of document list is larger than packed block size.</p>
+ *
+ * <ul>
+ * <li>docFile(.doc) --> Header, < TermFreqs, SkipData? ><sup>TermCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>TermFreqs --> < PackedBlock > <sup>PackedDocBlockNum</sup>,
+ * VIntBlock? </li>
+ * <li>PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
+ * <li>VIntBlock --> < DocDelta[, Freq?] ><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
+ * <li>SkipData --> <<SkipLevelLength, SkipLevel>
+ * <sup>NumSkipLevels-1</sup>, SkipLevel> <SkipDatum?></li>
+ * <li>SkipLevel --> <SkipDatum> <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li>
+ * <li>SkipDatum --> DocSkip, DocFPSkip, < PosFPSkip, PosBlockOffset, PayLength?,
+ * OffsetStart?, PayFPSkip? >?, SkipChildLevelPointer?</li>
+ * <li>PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}</li>
+ * <li>DocDelta,Freq,DocSkip,DocFPSkip,PosFPSkip,PosBlockOffset,PayLength,OffsetStart,PayFPSkip -->
+ * {@link DataOutput#writeVInt VInt}</li>
+ * <li>SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>PackedDocDeltaBlock is theoretically generated from two steps:
+ * <ol>
+ * <li>Calculate the difference between each document number and previous one,
+ * and get a d-gaps list (for the first document, use absolute value); </li>
+ * <li>For those d-gaps from first one to PackedDocBlockNum*PackedBlockSize<sup>th</sup>,
+ * separately encode as packed blocks.</li>
+ * </ol>
+ * If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
+ * </li>
+ * <li>VIntBlock stores remaining d-gaps (along with frequencies when possible) with a format
+ * mentioned in
+ * <!--NOTE: change this manual html link, when directory structure is changed. -->
+ * <a href="../../../../../org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.html#Frequencies">Lucene40PostingsFormat:Frequencies</a>
+ * </li>
+ * <li>PackedDocBlockNum is the number of packed blocks for current term's docids or frequencies.
+ * In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) </li>
+ * <li>TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq.
+ * We use this trick since the definition of skip entry is a little different from base interface.
+ * In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
+ * skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
+ * in BlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
+ * 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
+ * When DocFreq is a multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
+ * more skip data than BlockSkipWriter. </li>
+ * <li>SkipDatum is the metadata of one skip entry.
+ * For the first block (no matter packed or VInt), it is omitted.</li>
+ * <li>DocSkip records the document number of every PackedBlockSize<sup>th</sup> document number in
+ * the postings(i.e. last document number in each packed block). On disk it is stored as the
+ * difference from previous value in the sequence. </li>
+ * <li>DocFPSkip records the file offsets of each block (excluding the first one), i.e. of the postings at
+ * PackedBlockSize+1<sup>th</sup>, 2*PackedBlockSize+1<sup>th</sup> ... , in DocFile.
+ * The file offsets are relative to the start of current term's TermFreqs.
+ * On disk it is also stored as the difference from previous SkipDatum in the sequence.</li>
+ * <li>Since positions and payloads are also block encoded, the skip should skip to related block first,
+ * then fetch the values according to in-block offset. PosFPSkip and PayFPSkip record the file
+ * offsets of related block in .pos and .pay, respectively. While PosBlockOffset indicates
+ * which value to fetch inside the related block (PayBlockOffset is unnecessary since it is always
+ * equal to PosBlockOffset). Same as DocFPSkip, the file offsets are relative to the start of
+ * current term's TermFreqs, and stored as a difference sequence.</li>
+ * <li>PayLength indicates the length of last payload.</li>
+ * <li>OffsetStart indicates the first value of last offset pair.</li>
+ * </ul>
+ * </dd>
+ * </dl>
+ *
+ * <a name="Positions" id="Positions"></a>
+ * <dl>
+ * <dd>
+ * <b>Positions</b>
+ * <ul>
+ * <li>Pos(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>TermPositions --> < PackedPosDeltaBlock > <sup>PackedPosBlockNum</sup>,
+ * VIntBlock? </li>
+ * <li>VIntBlock --> PosVIntCount < PosDelta[, PayLength?], PayData?,
+ * OffsetStartDelta?, OffsetLength? ><sup>PosVIntCount</sup>
+ * <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
+ * <li>PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength -->
+ * {@link DataOutput#writeVInt VInt}</li>
+ * <li>PayData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>TermPositions are ordered by term (terms are implicit, from the term dictionary), and position
+ * values for each term document pair are incremental, and ordered by document number.</li>
+ * <li>The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock
+ * in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
+ * <li>PosDelta is the same as the format mentioned in
+ * <!--NOTE: change this manual html link, when directory structure is changed. -->
+ * <a href="../../../../../org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.html#Positions">Lucene40PostingsFormat:Positions</a>
+ * </li>
+ * <li>OffsetStartDelta is the difference between this position's startOffset from the previous
+ * occurrence (or zero, if this is the first occurrence in this document).</li>
+ * <li>OffsetLength indicates the length of the current offset (endOffset-startOffset).</li>
+ * <li>PayData is the blob of metadata associated with current position.</li>
+ * </ul>
+ * </dd>
+ * </dl>
+ *
+ * <a name="Payloads" id="Payloads"></a>
+ * <dl>
+ * <dd>
+ * <b>Payloads and Offsets</b>
+ * <ul>
+ * <li>PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>TermPayloads --> < PackedPayLengthBlock, PayBlockLength, PayData, PackedOffsetStartDeltaBlock?, PackedOffsetLengthBlock > <sup>PackedPayBlockNum</sup>
+ * <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}</li>
+ * <li>PayBlockLength --> {@link DataOutput#writeVInt VInt}</li>
+ * <li>PayData --> {@link DataOutput#writeByte byte}<sup>PayBlockLength</sup></li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>The order of TermPayloads/TermOffsets will be the same as TermPositions, note that part of
+ * payload/offsets are stored in .pos.</li>
+ * <li>The procedure how PackedPayLengthBlock is generated is the same as PackedFreqBlock
+ * in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
+ * <li>PayBlockLength is the total length of payloads written within one block, should be the sum
+ * of PayLengths in one packed block.</li>
+ * <li>PayLength is the length of each payload, associated with current position.</li>
+ * </ul>
+ * </dd>
+ * </dl>
+ * </p>
+ *
*/
+
public final class BlockPostingsFormat extends PostingsFormat {
public static final String DOC_EXTENSION = "doc";
public static final String POS_EXTENSION = "pos";
@@ -42,7 +322,7 @@ public final class BlockPostingsFormat e
private final int minTermBlockSize;
private final int maxTermBlockSize;
- // NOTE: must be factor of 64 because of PackedInts long-aligned encoding/decoding
+ // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
public BlockPostingsFormat() {
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java?rev=1374573&r1=1374572&r2=1374573&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java Sat Aug 18 13:12:47 2012
@@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils;
* @see BlockSkipReader for details
*
*/
-public final class BlockPostingsReader extends PostingsReaderBase {
+final class BlockPostingsReader extends PostingsReaderBase {
private final IndexInput docIn;
private final IndexInput posIn;
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java?rev=1374573&r1=1374572&r2=1374573&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsFormat.java Sat Aug 18 13:12:47 2012
@@ -159,7 +159,7 @@ import org.apache.lucene.util.fst.FST; /
* with the frequency of the term in that document (except when frequencies are
* omitted: {@link IndexOptions#DOCS_ONLY}).</p>
* <ul>
- * <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData> <sup>TermCount</sup></li>
+ * <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData?> <sup>TermCount</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermFreqs --> <TermFreq> <sup>DocFreq</sup></li>
* <li>TermFreq --> DocDelta[, Freq?]</li>