You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/08/18 19:59:17 UTC
svn commit: r1374620 - in
/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block:
BlockPostingsFormat.java BlockPostingsReader.java BlockPostingsWriter.java
Author: mikemccand
Date: Sat Aug 18 17:59:17 2012
New Revision: 1374620
URL: http://svn.apache.org/viewvc?rev=1374620&view=rev
Log:
LUCENE-3892: javadocs
Modified:
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java?rev=1374620&r1=1374619&r2=1374620&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java Sat Aug 18 17:59:17 2012
@@ -129,14 +129,14 @@ import org.apache.lucene.util.packed.Pac
* <PosFPDelta, PosVIntBlockFPDelta?, PayFPDelta?>?,
* SkipFPDelta?><sup>EntryCount</sup></li>
* <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength,
- * Byte<sup>RootCodeLength</sup>, SumDocFreq, DocCount>
+ * {@link DataOutput#writeByte byte}<sup>RootCodeLength</sup>, SumDocFreq, DocCount>
* <sup>NumFields</sup></li>
* <li>Header, PostingsHeader --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
* <li>PackedBlockSize, EntryCount, SuffixLength, StatsLength, DocFreq, MetaLength,
- * PosVIntBlockFPDelta , SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
+ * PosVIntBlockFPDelta, SkipFPDelta, NumFields, FieldNumber, RootCodeLength, DocCount -->
* {@link DataOutput#writeVInt VInt}</li>
- * <li>TotalTermFreq, DocFPDelta, PosFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq -->
+ * <li>TotalTermFreq, DocFPDelta, PosFPDelta, PayFPDelta, NumTerms, SumTotalTermFreq, SumDocFreq -->
* {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
@@ -203,7 +203,7 @@ import org.apache.lucene.util.packed.Pac
* <li>PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
* <li>VIntBlock --> <DocDelta[, Freq?]><sup>DocFreq-PackedBlockSize*PackedDocBlockNum</sup>
* <li>SkipData --> <<SkipLevelLength, SkipLevel>
- * <sup>NumSkipLevels-1</sup>, SkipLevel> <SkipDatum?></li>
+ * <sup>NumSkipLevels-1</sup>, SkipLevel>, SkipDatum?</li>
* <li>SkipLevel --> <SkipDatum> <sup>TrimmedDocFreq/(PackedBlockSize^(Level + 1))</sup></li>
* <li>SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
* OffsetStart?, PayFPSkip?>?, SkipChildLevelPointer?</li>
@@ -267,11 +267,11 @@ import org.apache.lucene.util.packed.Pac
* <p>The .pos file contains the lists of positions that each term occurs at within documents. It also
* sometimes stores part of payloads and offsets for speedup.</p>
* <ul>
- * <li>Pos(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li>
+ * <li>PosFile(.pos) --> Header, <TermPositions> <sup>TermCount</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
* VIntBlock? </li>
- * <li>VIntBlock --> PosVIntCount <PosDelta[, PayLength?], PayData?,
+ * <li>VIntBlock --> PosVIntCount, <PosDelta[, PayLength?], PayData?,
* OffsetStartDelta?, OffsetLength?><sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
* <li>PosVIntCount, PosDelta, OffsetStartDelta, OffsetLength -->
@@ -283,7 +283,9 @@ import org.apache.lucene.util.packed.Pac
* <li>TermPositions are order by term (terms are implicit, from the term dictionary), and position
* values for each term document pair are incremental, and ordered by document number.</li>
* <li>PackedPosBlockNum is the number of packed blocks for current term's positions, payloads or offsets.
- * In particular, PackedDocBlockNum = floor(totalTermFreq/PackedBlockSize) </li>
+ * In particular, PackedPosBlockNum = floor(totalTermFreq/PackedBlockSize) </li>
+ * <li>PosVIntCount is the number of positions encoded in VInt format. In particular,
+ * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize</li>
* <li>The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock
* in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.</li>
* <li>PosDelta is the same as the format mentioned in
@@ -302,12 +304,13 @@ import org.apache.lucene.util.packed.Pac
* <dl>
* <dd>
* <b>Payloads and Offsets</b>
- * <p>The .pay file will store payload and offset associated with certain term-document positons.
+ * <p>The .pay file will store payloads and offsets associated with certain term-document positions.
* Some payloads and offsets will be seperated out into .pos file, for speedup reason.</p>
* <ul>
* <li>PayFile(.pay): --> Header, <TermPayloads, TermOffsets?> <sup>TermCount</sup></li>
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>TermPayloads --> <PackedPayLengthBlock, PayBlockLength, PayData, PackedOffsetStartDeltaBlock?, PackedOffsetLengthBlock?> <sup>PackedPayBlockNum</sup>
+ * <li>TermPayloads --> <PackedPayLengthBlock, PayBlockLength, PayData> <sup>PackedPayBlockNum</sup>
+ * <li>TermOffsets --> <PackedOffsetStartDeltaBlock?, PackedOffsetLengthBlock?> <sup>PackedPayBlockNum</sup>
* <li>PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> {@link PackedInts PackedInts}</li>
* <li>PayBlockLength --> {@link DataOutput#writeVInt VInt}</li>
* <li>PayData --> {@link DataOutput#writeByte byte}<sup>PayBlockLength</sup></li>
@@ -319,11 +322,13 @@ import org.apache.lucene.util.packed.Pac
* <li>The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is the
* same as PackedFreqBlock in chapter <a href="#Frequencies">Frequencies and Skip Data</a>.
* While PackedStartDeltaBlock follows a same procedure as PackedDocDeltaBlock.</li>
+ * <li>PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also a synonym
+ * for PackedOffsetBlockNum.</li>
* <li>PayBlockLength is the total length of payloads written within one block, should be the sum
* of PayLengths in one packed block.</li>
* <li>PayLength in PackedPayLengthBlock is the length of each payload, associated with current
* position.</li>
- * </u>
+ * </ul>
* </dd>
* </dl>
* </p>
@@ -331,13 +336,31 @@ import org.apache.lucene.util.packed.Pac
*/
public final class BlockPostingsFormat extends PostingsFormat {
+ /**
+ * Filename extension for document number, frequencies, and skip data.
+ * See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
+ */
public static final String DOC_EXTENSION = "doc";
+
+ /**
+ * Filename extension for positions.
+ * See chapter: <a href="#Positions">Positions</a>
+ */
public static final String POS_EXTENSION = "pos";
+
+ /**
+ * Filename extension for payloads and offsets.
+ * See chapter: <a href="#Payloads">Payloads and Offsets</a>
+ */
public static final String PAY_EXTENSION = "pay";
private final int minTermBlockSize;
private final int maxTermBlockSize;
+ /**
+ * Fixed packed block size, number of integers encoded in
+ * a single packed block.
+ */
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java?rev=1374620&r1=1374619&r2=1374620&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java Sat Aug 18 17:59:17 2012
@@ -502,6 +502,8 @@ final class BlockPostingsReader extends
skipped = true;
}
+ // always add one to the result, since the skip position in BlockSkipReader
+ // is slightly different from the one in MultiLevelSkipListReader
final int newDocUpto = skipper.skipTo(target) + 1;
if (newDocUpto > docUpto) {
@@ -517,6 +519,8 @@ final class BlockPostingsReader extends
accum = skipper.getDoc(); // actually, this is just lastSkipEntry
docIn.seek(skipper.getDocPointer()); // now point to the block we want to search
}
+ // next time we call advance, this is used to
+ // foresee whether skipper is necessary.
nextSkipDoc = skipper.getNextSkipDoc();
}
if (docUpto == docFreq) {
Modified: lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java?rev=1374620&r1=1374619&r2=1374620&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/core/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java Sat Aug 18 17:59:17 2012
@@ -52,6 +52,10 @@ import org.apache.lucene.util.packed.Pac
*/
final class BlockPostingsWriter extends PostingsWriterBase {
+ /**
+ * Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ */
static final int maxSkipLevels = 10;
final static String TERMS_CODEC = "BlockPostingsWriterTerms";