You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/07/04 17:16:40 UTC
svn commit: r553236 [1/6] - in /lucene/java/trunk: ./ contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/ docs/ src/java/org/apache/lucene/analysis/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/store/ src/site/src/documentati...

Author: mikemccand
Date: Wed Jul  4 08:16:38 2007
New Revision: 553236

URL: http://svn.apache.org/viewvc?view=rev&rev=553236
Log:
LUCENE-843: speed up IndexWriter performance

Added:
    lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/index/index.presharedstores.cfs.zip   (with props)
    lucene/java/trunk/src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
    lucene/java/trunk/docs/fileformats.html
    lucene/java/trunk/docs/fileformats.pdf
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfo.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldInfos.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileDeleter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexModifier.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfos.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsReader.java
    lucene/java/trunk/src/java/org/apache/lucene/store/IndexOutput.java
    lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
    lucene/java/trunk/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestDeletionPolicy.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexFileDeleter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexModifier.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexReader.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriterDelete.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestLazyProxSkipping.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestPayloads.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestStressIndexing.java
    lucene/java/trunk/src/test/org/apache/lucene/search/TestTermVectors.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Jul  4 08:16:38 2007
@@ -105,6 +105,13 @@
     to be public because it implements the public interface TermPositionVector.
     (Michael Busch)
 
+14. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have
+    IndexWriter flush whenever the buffered documents are using more
+    than the specified amount of RAM.  Also added new APIs to Token
+    that allow one to set a char[] plus offset and length to specify a
+    token (to avoid creating a new String() for each Token).  (Mike
+    McCandless)
+ 
 Bug fixes
 
  1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist.  (Doron Cohen)
@@ -267,6 +274,12 @@
     lists. For average AND queries the speedup is about 20%, for queries that 
     contain very frequent and very unique terms the speedup can be over 80%.
     (Michael Busch)
+
+ 8. LUCENE-843: Substantial optimizations to improve how IndexWriter
+    uses RAM for buffering documents and to speed up indexing (2X-8X
+    faster).  A single shared hash table now records the in-memory
+    postings per unique term and is directly flushed into a single
+    segment.  (Mike McCandless)
  
 Documentation
 

Modified: lucene/java/trunk/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java (original)
+++ lucene/java/trunk/contrib/xml-query-parser/src/test/org/apache/lucene/xmlparser/TestParser.java Wed Jul  4 08:16:38 2007
@@ -77,6 +77,7 @@
 				line=d.readLine();
 			}			
 			d.close();
+                        writer.close();
 		}
 		reader=IndexReader.open(dir);
 		searcher=new IndexSearcher(reader);

Modified: lucene/java/trunk/docs/fileformats.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/fileformats.html?view=diff&rev=553236&r1=553235&r2=553236
==============================================================================
--- lucene/java/trunk/docs/fileformats.html (original)
+++ lucene/java/trunk/docs/fileformats.html Wed Jul  4 08:16:38 2007
@@ -380,10 +380,18 @@
                 But note that once a commit has occurred, pre-2.1
                 Lucene will not be able to read the index.
             </p>
+<p>
+                In version 2.3, the file format was changed to allow
+		segments to share a single set of doc store (vectors &amp;
+		stored fields) files.  This allows for faster indexing
+		in certain cases.  The change is fully backwards
+		compatible (in the same way as the lock-less commits
+		change in 2.1).
+            </p>
 </div>
 
         
-<a name="N10032"></a><a name="Definitions"></a>
+<a name="N10035"></a><a name="Definitions"></a>
 <h2 class="boxed">Definitions</h2>
 <div class="section">
 <p>
@@ -424,7 +432,7 @@
                 strings, the first naming the field, and the second naming text
                 within the field.
             </p>
-<a name="N10052"></a><a name="Inverted Indexing"></a>
+<a name="N10055"></a><a name="Inverted Indexing"></a>
 <h3 class="boxed">Inverted Indexing</h3>
 <p>
                     The index stores statistics about terms in order
@@ -434,7 +442,7 @@
                     it.  This is the inverse of the natural relationship, in which
                     documents list terms.
                 </p>
-<a name="N1005E"></a><a name="Types of Fields"></a>
+<a name="N10061"></a><a name="Types of Fields"></a>
 <h3 class="boxed">Types of Fields</h3>
 <p>
                     In Lucene, fields may be <i>stored</i>, in which
@@ -448,7 +456,7 @@
                     to be indexed literally.
                 </p>
 <p>See the <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html">Field</a> java docs for more information on Fields.</p>
-<a name="N1007B"></a><a name="Segments"></a>
+<a name="N1007E"></a><a name="Segments"></a>
 <h3 class="boxed">Segments</h3>
 <p>
                     Lucene indexes may be composed of multiple sub-indexes, or
@@ -474,7 +482,7 @@
                     Searches may involve multiple segments and/or multiple indexes, each
                     index potentially composed of a set of segments.
                 </p>
-<a name="N10099"></a><a name="Document Numbers"></a>
+<a name="N1009C"></a><a name="Document Numbers"></a>
 <h3 class="boxed">Document Numbers</h3>
 <p>
                     Internally, Lucene refers to documents by an integer <i>document
@@ -529,7 +537,7 @@
 </div>
 
         
-<a name="N100C0"></a><a name="Overview"></a>
+<a name="N100C3"></a><a name="Overview"></a>
 <h2 class="boxed">Overview</h2>
 <div class="section">
 <p>
@@ -626,7 +634,7 @@
 </div>
 
         
-<a name="N10103"></a><a name="File Naming"></a>
+<a name="N10106"></a><a name="File Naming"></a>
 <h2 class="boxed">File Naming</h2>
 <div class="section">
 <p>
@@ -654,10 +662,10 @@
 </div>
 
         
-<a name="N10112"></a><a name="Primitive Types"></a>
+<a name="N10115"></a><a name="Primitive Types"></a>
 <h2 class="boxed">Primitive Types</h2>
 <div class="section">
-<a name="N10117"></a><a name="Byte"></a>
+<a name="N1011A"></a><a name="Byte"></a>
 <h3 class="boxed">Byte</h3>
 <p>
                     The most primitive type
@@ -665,7 +673,7 @@
                     other data types are defined as sequences
                     of bytes, so file formats are byte-order independent.
                 </p>
-<a name="N10120"></a><a name="UInt32"></a>
+<a name="N10123"></a><a name="UInt32"></a>
 <h3 class="boxed">UInt32</h3>
 <p>
                     32-bit unsigned integers are written as four
@@ -675,7 +683,7 @@
                     UInt32    --&gt; &lt;Byte&gt;<sup>4</sup>
                 
 </p>
-<a name="N1012F"></a><a name="Uint64"></a>
+<a name="N10132"></a><a name="Uint64"></a>
 <h3 class="boxed">Uint64</h3>
 <p>
                     64-bit unsigned integers are written as eight
@@ -684,7 +692,7 @@
 <p>UInt64    --&gt; &lt;Byte&gt;<sup>8</sup>
                 
 </p>
-<a name="N1013E"></a><a name="VInt"></a>
+<a name="N10141"></a><a name="VInt"></a>
 <h3 class="boxed">VInt</h3>
 <p>
                     A variable-length format for positive integers is
@@ -1234,7 +1242,7 @@
                     This provides compression while still being
                     efficient to decode.
                 </p>
-<a name="N10423"></a><a name="Chars"></a>
+<a name="N10426"></a><a name="Chars"></a>
 <h3 class="boxed">Chars</h3>
 <p>
                     Lucene writes unicode
@@ -1243,7 +1251,7 @@
                         UTF-8 encoding"</a>
                     .
                 </p>
-<a name="N10430"></a><a name="String"></a>
+<a name="N10433"></a><a name="String"></a>
 <h3 class="boxed">String</h3>
 <p>
                     Lucene writes strings as a VInt representing the length, followed by
@@ -1255,13 +1263,13 @@
 </div>
 
         
-<a name="N1043D"></a><a name="Per-Index Files"></a>
+<a name="N10440"></a><a name="Per-Index Files"></a>
 <h2 class="boxed">Per-Index Files</h2>
 <div class="section">
 <p>
                 The files in this section exist one-per-index.
             </p>
-<a name="N10445"></a><a name="Segments File"></a>
+<a name="N10448"></a><a name="Segments File"></a>
 <h3 class="boxed">Segments File</h3>
 <p>
                     The active segments in the index are stored in the
@@ -1316,16 +1324,24 @@
                 
 </p>
 <p>
-                    Format, NameCounter, SegCount, SegSize, NumField --&gt; Int32
+                    
+<b>2.3 and above:</b>
+                    Segments --&gt; Format, Version, NameCounter, SegCount, &lt;SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
+                    NormGen<sup>NumField</sup>,
+                    IsCompoundFile&gt;<sup>SegCount</sup>
+                
+</p>
+<p>
+                    Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --&gt; Int32
                 </p>
 <p>
                     Version, DelGen, NormGen --&gt; Int64
                 </p>
 <p>
-                    SegName --&gt; String
+                    SegName, DocStoreSegment --&gt; String
                 </p>
 <p>
-                    IsCompoundFile, HasSingleNormFile --&gt; Int8
+                    IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile --&gt; Int8
                 </p>
 <p>
                     Format is -1 as of Lucene 1.4 and -3 (SemgentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1.
@@ -1380,7 +1396,28 @@
                     are stored as separate <tt>.fN</tt> files.  See
                     "Normalization Factors" below for details.
                 </p>
-<a name="N104A9"></a><a name="Lock File"></a>
+<p>
+		    DocStoreOffset, DocStoreSegment,
+                    DocStoreIsCompoundFile: If DocStoreOffset is -1,
+                    this segment has its own doc store (stored fields
+                    values and term vectors) files and DocStoreSegment
+                    and DocStoreIsCompoundFile are not stored.  In
+                    this case all files for stored field values
+                    (<tt>*.fdt</tt> and <tt>*.fdx</tt>) and term
+                    vectors (<tt>*.tvf</tt>, <tt>*.tvd</tt> and
+                    <tt>*.tvx</tt>) will be stored with this segment.
+                    Otherwise, DocStoreSegment is the name of the
+                    segment that has the shared doc store files;
+                    DocStoreIsCompoundFile is 1 if that segment is
+                    stored in compound file format (as a <tt>.cfx</tt>
+                    file); and DocStoreOffset is the starting document
+                    in the shared doc store files where this segment's
+                    documents begin.  In this case, this segment does
+                    not store its own doc store files but instead
+                    shares a single set of these files with other
+                    segments.
+                </p>
+<a name="N104CD"></a><a name="Lock File"></a>
 <h3 class="boxed">Lock File</h3>
 <p>
                     The write lock, which is stored in the index
@@ -1398,7 +1435,7 @@
                     Note that prior to version 2.1, Lucene also used a
                     commit lock. This was removed in 2.1.
                 </p>
-<a name="N104B5"></a><a name="Deletable File"></a>
+<a name="N104D9"></a><a name="Deletable File"></a>
 <h3 class="boxed">Deletable File</h3>
 <p>
                     Prior to Lucene 2.1 there was a file "deletable"
@@ -1407,7 +1444,7 @@
                     the files that are deletable, instead, so no file
                     is written.
                 </p>
-<a name="N104BE"></a><a name="Compound Files"></a>
+<a name="N104E2"></a><a name="Compound Files"></a>
 <h3 class="boxed">Compound Files</h3>
 <p>Starting with Lucene 1.4 the compound file format became default. This
                     is simply a container for all files described in the next section
@@ -1424,17 +1461,24 @@
 <p>FileName --&gt; String</p>
 <p>FileData --&gt; raw file data</p>
 <p>The raw file data is the data from the individual files named above.</p>
+<p>Starting with Lucene 2.3, doc store files (stored
+		field values and term vectors) can be shared in a
+		single set of files for more than one segment.  When
+		compound file is enabled, these shared files will be
+		added into a single compound file (same format as
+		above) but with the extension <tt>.cfx</tt>.
+		</p>
 </div>
 
         
-<a name="N104E0"></a><a name="Per-Segment Files"></a>
+<a name="N1050A"></a><a name="Per-Segment Files"></a>
 <h2 class="boxed">Per-Segment Files</h2>
 <div class="section">
 <p>
                 The remaining files are all per-segment, and are
                 thus defined by suffix.
             </p>
-<a name="N104E8"></a><a name="Fields"></a>
+<a name="N10512"></a><a name="Fields"></a>
 <h3 class="boxed">Fields</h3>
 <p>
                     
@@ -1653,7 +1697,7 @@
 </li>
                 
 </ol>
-<a name="N105A3"></a><a name="Term Dictionary"></a>
+<a name="N105CD"></a><a name="Term Dictionary"></a>
 <h3 class="boxed">Term Dictionary</h3>
 <p>
                     The term dictionary is represented as two files:
@@ -1839,7 +1883,7 @@
 </li>
                 
 </ol>
-<a name="N10623"></a><a name="Frequencies"></a>
+<a name="N1064D"></a><a name="Frequencies"></a>
 <h3 class="boxed">Frequencies</h3>
 <p>
                     The .frq file contains the lists of documents
@@ -1957,7 +2001,7 @@
                    entry in level-1. In the example has entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a pointer
                    to entry 31 on level 0.                   
                 </p>
-<a name="N106A5"></a><a name="Positions"></a>
+<a name="N106CF"></a><a name="Positions"></a>
 <h3 class="boxed">Positions</h3>
 <p>
                     The .prx file contains the lists of positions that
@@ -2023,7 +2067,7 @@
                     Payload. If PayloadLength is not stored, then this Payload has the same
                     length as the Payload at the previous position.
                 </p>
-<a name="N106E1"></a><a name="Normalization Factors"></a>
+<a name="N1070B"></a><a name="Normalization Factors"></a>
 <h3 class="boxed">Normalization Factors</h3>
 <p>
                     
@@ -2127,7 +2171,7 @@
 <b>2.1 and above:</b>
                     Separate norm files are created (when adequate) for both compound and non compound segments.
                 </p>
-<a name="N1074A"></a><a name="Term Vectors"></a>
+<a name="N10774"></a><a name="Term Vectors"></a>
 <h3 class="boxed">Term Vectors</h3>
 <ol>
                     
@@ -2253,7 +2297,7 @@
 </li>
                 
 </ol>
-<a name="N107DD"></a><a name="Deleted Documents"></a>
+<a name="N10807"></a><a name="Deleted Documents"></a>
 <h3 class="boxed">Deleted Documents</h3>
 <p>The .del file is
                     optional, and only exists when a segment contains deletions.
@@ -2325,7 +2369,7 @@
 </div>
 
         
-<a name="N10820"></a><a name="Limitations"></a>
+<a name="N1084A"></a><a name="Limitations"></a>
 <h2 class="boxed">Limitations</h2>
 <div class="section">
 <p>There