You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/04/26 00:44:06 UTC

svn commit: r1330591 - in /lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40: Lucene40SegmentInfosFormat.java Lucene40TermVectorsFormat.java

Author: rmuir
Date: Wed Apr 25 22:44:05 2012
New Revision: 1330591

URL: http://svn.apache.org/viewvc?rev=1330591&view=rev
Log:
LUCENE-2946: doc 4.0 term vectors format

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java?rev=1330591&r1=1330590&r2=1330591&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40SegmentInfosFormat.java Wed Apr 25 22:44:05 2012
@@ -1,17 +1,5 @@
 package org.apache.lucene.codecs.lucene40;
 
-import org.apache.lucene.codecs.Codec; // javadocs
-import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
-import org.apache.lucene.codecs.SegmentInfosFormat;
-import org.apache.lucene.codecs.SegmentInfosReader;
-import org.apache.lucene.codecs.SegmentInfosWriter;
-import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
-import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
-import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
-import org.apache.lucene.index.IndexWriter; // javadocs
-import org.apache.lucene.index.SegmentInfos; // javadocs
-import org.apache.lucene.store.DataOutput; // javadocs
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,6 +17,18 @@ import org.apache.lucene.store.DataOutpu
  * limitations under the License.
  */
 
+import org.apache.lucene.codecs.Codec; // javadocs
+import org.apache.lucene.codecs.LiveDocsFormat; // javadocs
+import org.apache.lucene.codecs.SegmentInfosFormat;
+import org.apache.lucene.codecs.SegmentInfosReader;
+import org.apache.lucene.codecs.SegmentInfosWriter;
+import org.apache.lucene.codecs.StoredFieldsFormat; // javadocs
+import org.apache.lucene.codecs.TermVectorsFormat; // javadocs
+import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
+import org.apache.lucene.index.IndexWriter; // javadocs
+import org.apache.lucene.index.SegmentInfos; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
+
 /**
  * Lucene 4.0 Segments format.
  * <p>

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java?rev=1330591&r1=1330590&r2=1330591&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsFormat.java Wed Apr 25 22:44:05 2012
@@ -25,9 +25,81 @@ import org.apache.lucene.codecs.TermVect
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.DataOutput; // javadocs
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 
+/**
+ * Lucene 4.0 Term Vectors format.
+ * <p>Term Vector support is optional on a field-by-field basis. It consists of
+ * 3 files.</p>
+ * <ol>
+ * <li><a name="tvx" id="tvx"></a>
+ * <p>The Document Index or .tvx file.</p>
+ * <p>For each document, this stores the offset into the document data (.tvd) and
+ * field data (.tvf) files.</p>
+ * <p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
+ * <sup>NumDocs</sup></p>
+ * <ul>
+ *   <li>TVXVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>DocumentPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvd file)</li>
+ *   <li>FieldPosition --&gt; {@link DataOutput#writeLong UInt64} (offset in the .tvf file)</li>
+ * </ul>
+ * </li>
+ * <li><a name="tvd" id="tvd"></a>
+ * <p>The Document or .tvd file.</p>
+ * <p>This contains, for each document, the number of fields, a list of the fields
+ * with term vector info and finally a list of pointers to the field information
+ * in the .tvf (Term Vector Fields) file.</p>
+ * <p>The .tvd file is used to map out the fields that have term vectors stored
+ * and where the field information is in the .tvf file.</p>
+ * <p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
+ * FieldPositions&gt; <sup>NumDocs</sup></p>
+ * <ul>
+ *   <li>TVDVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>NumFields --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></li>
+ *   <li>FieldNumDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></li>
+ *   <li>FieldPositionDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * </li>
+ * <li><a name="tvf" id="tvf"></a>
+ * <p>The Field or .tvf file.</p>
+ * <p>This file contains, for each field that has a term vector stored, a list of
+ * the terms, their frequencies and, optionally, position and offset
+ * information.</p>
+ * <p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
+ * <sup>NumFields</sup></p>
+ * <ul>
+ *   <li>TVFVersion --&gt; {@link DataOutput#writeInt Int32} (<code>Lucene40TermVectorsReader.FORMAT_CURRENT</code>)</li>
+ *   <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Position/Offset --&gt; {@link DataOutput#writeByte Byte}</li>
+ *   <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
+ *       <sup>NumTerms</sup></li>
+ *   <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
+ *   <li>PrefixLength --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Suffix --&gt; {@link DataOutput#writeString String}</li>
+ *   <li>TermFreq --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>Positions --&gt; &lt;{@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
+ *   <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Position/Offset byte stores whether this term vector has position or offset
+ * information stored.</li>
+ * <li>Term byte prefixes are shared. The PrefixLength is the number of initial
+ * bytes from the previous term which must be pre-pended to a term's suffix
+ * in order to form the term's bytes. Thus, if the previous term's text was "bone"
+ * and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
+ * <li>Positions are stored as delta encoded VInts. This means we only store the
+ * difference of the current position from the last position.</li>
+ * <li>Offsets are stored as delta encoded VInts. The first VInt is the
+ * startOffset, the second is the endOffset.</li>
+ * </ul>
+ * </li>
+ * </ol>
+ */
 public class Lucene40TermVectorsFormat extends TermVectorsFormat {
 
   @Override