You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/04/22 02:07:16 UTC

svn commit: r1328748 [1/2] - in /lucene/dev/trunk: lucene/ lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ lucene/site/ lucene/site/build/ lucene/site/html/ lucene/site/src/ lucene/site/xsl/ solr/

Author: uschindler
Date: Sun Apr 22 00:07:15 2012
New Revision: 1328748

URL: http://svn.apache.org/viewvc?rev=1328748&view=rev
Log:
LUCENE-4007: Forrest Chainsaw Massacre

Added:
    lucene/dev/trunk/lucene/site/html/
    lucene/dev/trunk/lucene/site/html/demo.html   (with props)
    lucene/dev/trunk/lucene/site/html/demo2.html   (with props)
    lucene/dev/trunk/lucene/site/html/fileformats.html   (with props)
    lucene/dev/trunk/lucene/site/html/lucene_green_300.gif   (with props)
    lucene/dev/trunk/lucene/site/html/scoring.html   (with props)
    lucene/dev/trunk/lucene/site/xsl/
    lucene/dev/trunk/lucene/site/xsl/index.xsl   (with props)
Removed:
    lucene/dev/trunk/lucene/index.html
    lucene/dev/trunk/lucene/site/build/
    lucene/dev/trunk/lucene/site/forrest.properties
    lucene/dev/trunk/lucene/site/src/
Modified:
    lucene/dev/trunk/lucene/build.xml
    lucene/dev/trunk/lucene/common-build.xml
    lucene/dev/trunk/lucene/module-build.xml
    lucene/dev/trunk/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html
    lucene/dev/trunk/solr/common-build.xml

Modified: lucene/dev/trunk/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/build.xml?rev=1328748&r1=1328747&r2=1328748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/build.xml (original)
+++ lucene/dev/trunk/lucene/build.xml Sun Apr 22 00:07:15 2012
@@ -174,18 +174,6 @@
     <license-check-macro dir="${basedir}" />
   </target>
 
-  <!-- ================================================================== -->
-  <!-- D O C U M E N T A T I O N                                          -->
-  <!-- ================================================================== -->
-  <!--                                                                    -->
-  <!-- ================================================================== -->
-  <target name="docs">
-  <!-- copies the docs over to the docs folder -->
-    <copy todir="build/docs">
-      <fileset dir="site/build/site"/> 
-    </copy>
-  </target>
-
   <target name="resolve">
     <sequential>
       <ant dir="test-framework" target="resolve" inheritall="false">
@@ -195,20 +183,26 @@
     </sequential>
   </target>
 
+  <target name="documentation" description="Generate all documentation"
+    depends="javadocs,changes-to-html,doc-index"/>
   <target name="javadoc" depends="javadocs"/>
-  <target name="javadocs" description="Generate javadoc" 
-          depends="javadocs-lucene-core, javadocs-modules, javadocs-test-framework">
-    <echo file="${javadoc.dir}/index.html" append="false">
-<![CDATA[<html><head><title>${Name} ${version} Javadoc Index</title></head>
-<body>
-<h1>${Name} ${version} Javadoc Index</h1>
-<ul>
-  <li><a href="core/index.html">core</a>: Lucene core library</li>
-  <li><a href="test-framework/index.html">test-framework</a>: Framework for testing Lucene-based applications</li>
-]]></echo>
-    <modules-crawl target="javadocs-index.html" failonerror="true"/>
-    <echo file="${javadoc.dir}/index.html" append="true"><![CDATA[
-</ul></body>]]></echo>
+  <target name="javadocs" description="Generate javadoc" depends="javadocs-lucene-core, javadocs-modules, javadocs-test-framework"/>
+  
+  <target name="doc-index">
+    <pathconvert pathsep="|" dirsep="/" property="buildfiles">
+      <fileset dir="." includes="**/build.xml" excludes="build.xml,analysis/*,build/**,tools/**,backwards/**,site/**"/>
+    </pathconvert>
+    <xslt in="${ant.file}" out="${javadoc.dir}/index.html" style="site/xsl/index.xsl">
+      <outputproperty name="method" value="html"/>
+      <outputproperty name="version" value="4.0"/>
+      <outputproperty name="encoding" value="UTF-8"/>
+      <outputproperty name="indent" value="yes"/>
+      <param name="buildfiles" expression="${buildfiles}"/>
+      <param name="version" expression="${version}"/>
+    </xslt>
+    <copy todir="${javadoc.dir}">
+      <fileset dir="site/html" includes="**/*"/>
+    </copy>
   </target>
 	
   <target name="javadocs-modules" description="Generate javadoc for modules classes">
@@ -230,7 +224,7 @@
   <!-- ================================================================== -->
   <!--                                                                    -->
   <!-- ================================================================== -->
-  <target name="package" depends="jar-core, jar-test-framework, docs, javadocs, build-modules, init-dist, changes-to-html"/>
+  <target name="package" depends="jar-core, jar-test-framework, build-modules, init-dist, documentation"/>
 
   <target name="nightly" depends="test, package-tgz">
   </target>

Modified: lucene/dev/trunk/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/common-build.xml?rev=1328748&r1=1328747&r2=1328748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/common-build.xml (original)
+++ lucene/dev/trunk/lucene/common-build.xml Sun Apr 22 00:07:15 2012
@@ -124,7 +124,7 @@
   <available file="${javadoc.packagelist.dir}/java6/package-list" property="javadoc.java6.packagelist.exists"/>
   <property name="javadoc.access" value="protected"/>
   <property name="javadoc.charset" value="utf-8"/>
-  <property name="javadoc.dir" value="${common.dir}/build/docs/api"/>
+  <property name="javadoc.dir" value="${common.dir}/build/docs"/>
   <property name="javadoc.maxmemory" value="512m" />
   <!-- Javadoc classpath -->
   <path id="javadoc.classpath">

Modified: lucene/dev/trunk/lucene/module-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/module-build.xml?rev=1328748&r1=1328747&r2=1328748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/module-build.xml (original)
+++ lucene/dev/trunk/lucene/module-build.xml Sun Apr 22 00:07:15 2012
@@ -90,14 +90,6 @@
     </sequential>
   </macrodef>
 
-  <target name="javadocs-index.html" description="Generate line for index.html of JavaDocs">
-    <xmlproperty file="${ant.file}" collapseAttributes="true"/>
-    <echo file="${javadoc.dir}/index.html" append="true">
-<![CDATA[
-  <li><a href="${name}/index.html">${name}</a>: ${project.description}</li>
-]]></echo>
-  </target>
-
   <property name="queryparser.jar" value="${common.dir}/build/queryparser/lucene-queryparser-${version}.jar"/>
   <target name="check-queryparser-uptodate" unless="queryparser.uptodate">
     <module-uptodate name="queryparser" jarfile="${queryparser.jar}" property="queryparser.uptodate"/>

Modified: lucene/dev/trunk/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html?rev=1328748&r1=1328747&r2=1328748&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html (original)
+++ lucene/dev/trunk/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html Sun Apr 22 00:07:15 2012
@@ -31,5 +31,272 @@ must always be fully qualified in source
 <p><b>NOTE</b>: {@link org.apache.lucene.queryparser.flexible.standard} has an alternative queryparser that matches the syntax of this one, but is more modular,
 enabling substantial customization to how a query is created.
 
+<h2>Query Parser Syntax</h2>
+
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Overview">Overview</a>
+</li>
+<li>
+<a href="#Terms">Terms</a>
+</li>
+<li>
+<a href="#Fields">Fields</a>
+</li>
+<li>
+<a href="#Term Modifiers">Term Modifiers</a>
+<ul class="minitoc">
+<li>
+<a href="#Wildcard Searches">Wildcard Searches</a>
+</li>
+<li>
+<a href="#Fuzzy Searches">Fuzzy Searches</a>
+</li>
+<li>
+<a href="#Proximity Searches">Proximity Searches</a>
+</li>
+<li>
+<a href="#Range Searches">Range Searches</a>
+</li>
+<li>
+<a href="#Boosting a Term">Boosting a Term</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Boolean operators">Boolean Operators</a>
+<ul class="minitoc">
+<li>
+<a href="#OR">OR</a>
+</li>
+<li>
+<a href="#AND">AND</a>
+</li>
+<li>
+<a href="#+">+</a>
+</li>
+<li>
+<a href="#NOT">NOT</a>
+</li>
+<li>
+<a href="#-">-</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Grouping">Grouping</a>
+</li>
+<li>
+<a href="#Field Grouping">Field Grouping</a>
+</li>
+<li>
+<a href="#Escaping Special Characters">Escaping Special Characters</a>
+</li>
+</ul>
+</div>
+        
+<a name="N10013"></a><a name="Overview"></a>
+<h2 class="boxed">Overview</h2>
+<div class="section">
+<p>Although Lucene provides the ability to create your own
+            queries through its API, it also provides a rich query
+            language through the Query Parser, a lexer which
+            interprets a string into a Lucene Query using JavaCC.
+	    </p>
+<p>Generally, the query parser syntax may change from
+	    release to release.  This page describes the syntax as of
+	    the current release.  If you are using a different
+	    version of Lucene, please consult the copy of
+	    <span class="codefrag">docs/queryparsersyntax.html</span> that was distributed
+	    with the version you are using.
+	    </p>
+<p>
+            Before choosing to use the provided Query Parser, please consider the following:
+            <ol>
+            
+<li>If you are programmatically generating a query string and then
+            parsing it with the query parser then you should seriously consider building
+            your queries directly with the query API.  In other words, the query
+            parser is designed for human-entered text, not for program-generated
+            text.</li>
+
+            
+<li>Untokenized fields are best added directly to queries, and not
+            through the query parser.  If a field's values are generated programmatically
+            by the application, then so should query clauses for this field.
+            An analyzer, which the query parser uses, is designed to convert human-entered
+            text to terms.  Program-generated values, like dates, keywords, etc.,
+            should be consistently program-generated.</li>
+
+            
+<li>In a query form, fields which are general text should use the query
+            parser.  All others, such as date ranges, keywords, etc. are better added
+            directly through the query API.  A field with a limited set of values,
+            that can be specified with a pull-down menu should not be added to a
+            query string which is subsequently parsed, but rather added as a
+            TermQuery clause.</li>
+            
+</ol>
+            
+</p>
+</div>
+
+        
+<a name="N10032"></a><a name="Terms"></a>
+<h2 class="boxed">Terms</h2>
+<div class="section">
+<p>A query is broken up into terms and operators. There are two types of terms: Single Terms and Phrases.</p>
+<p>A Single Term is a single word such as "test" or "hello".</p>
+<p>A Phrase is a group of words surrounded by double quotes such as "hello dolly".</p>
+<p>Multiple terms can be combined together with Boolean operators to form a more complex query (see below).</p>
+<p>Note: The analyzer used to create the index will be used on the terms and phrases in the query string.
+        So it is important to choose an analyzer that will not interfere with the terms used in the query string.</p>
+</div>
+
+        
+<a name="N10048"></a><a name="Fields"></a>
+<h2 class="boxed">Fields</h2>
+<div class="section">
+<p>Lucene supports fielded data. When performing a search you can either specify a field, or use the default field. The field names and default field are implementation specific.</p>
+<p>You can search any field by typing the field name followed by a colon ":" and then the term you are looking for. </p>
+<p>As an example, let's assume a Lucene index contains two fields, title and text and text is the default field.
+        If you want to find the document entitled "The Right Way" which contains the text "don't go this way", you can enter: </p>
+<pre class="code">title:"The Right Way" AND text:go</pre>
+<p>or</p>
+<pre class="code">title:"Do it right" AND right</pre>
+<p>Since text is the default field, the field indicator is not required.</p>
+<p>Note: The field is only valid for the term that it directly precedes, so the query</p>
+<pre class="code">title:Do it right</pre>
+<p>Will only find "Do" in the title field. It will find "it" and "right" in the default field (in this case the text field). </p>
+</div>
+
+        
+<a name="N1006D"></a><a name="Term Modifiers"></a>
+<h2 class="boxed">Term Modifiers</h2>
+<div class="section">
+<p>Lucene supports modifying query terms to provide a wide range of searching options.</p>
+<a name="N10076"></a><a name="Wildcard Searches"></a>
+<h3 class="boxed">Wildcard Searches</h3>
+<p>Lucene supports single and multiple character wildcard searches within single terms
+        (not within phrase queries).</p>
+<p>To perform a single character wildcard search use the "?" symbol.</p>
+<p>To perform a multiple character wildcard search use the "*" symbol.</p>
+<p>The single character wildcard search looks for terms that match that with the single character replaced. For example, to search for "text" or "test" you can use the search:</p>
+<pre class="code">te?t</pre>
+<p>Multiple character wildcard searches look for 0 or more characters. For example, to search for test, tests or tester, you can use the search: </p>
+<pre class="code">test*</pre>
+<p>You can also use the wildcard searches in the middle of a term.</p>
+<pre class="code">te*t</pre>
+<p>Note: You cannot use a * or ? symbol as the first character of a search.</p>
+<a name="N1009B"></a><a name="Fuzzy Searches"></a>
+<h3 class="boxed">Fuzzy Searches</h3>
+<p>Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: </p>
+<pre class="code">roam~</pre>
+<p>This search will find terms like foam and roams.</p>
+<p>Starting with Lucene 1.9 an additional (optional) parameter can specify the required similarity. The value is between 0 and 1, with a value closer to 1 only terms with a higher similarity will be matched. For example:</p>
+<pre class="code">roam~0.8</pre>
+<p>The default that is used if the parameter is not given is 0.5.</p>
+<a name="N100B4"></a><a name="Proximity Searches"></a>
+<h3 class="boxed">Proximity Searches</h3>
+<p>Lucene supports finding words that are within a specific distance of each other. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example to search for "apache" and "jakarta" within 10 words of each other in a document use the search: </p>
+<pre class="code">"jakarta apache"~10</pre>
+<a name="N100C1"></a><a name="Range Searches"></a>
+<h3 class="boxed">Range Searches</h3>
+<p>Range Queries allow one to match documents whose field(s) values
+            are between the lower and upper bound specified by the Range Query.
+            Range Queries can be inclusive or exclusive of the upper and lower bounds.
+            Sorting is done lexicographically.</p>
+<pre class="code">mod_date:[20020101 TO 20030101]</pre>
+<p>This will find documents whose mod_date fields have values between 20020101 and 20030101, inclusive.
+            Note that Range Queries are not reserved for date fields.  You could also use range queries with non-date fields:</p>
+<pre class="code">title:{Aida TO Carmen}</pre>
+<p>This will find all documents whose titles are between Aida and Carmen, but not including Aida and Carmen.</p>
+<p>Inclusive range queries are denoted by square brackets.  Exclusive range queries are denoted by
+            curly brackets.</p>
+<a name="N100DA"></a><a name="Boosting a Term"></a>
+<h3 class="boxed">Boosting a Term</h3>
+<p>Lucene provides the relevance level of matching documents based on the terms found. To boost a term use the caret, "^", symbol with a boost factor (a number) at the end of the term you are searching. The higher the boost factor, the more relevant the term will be.</p>
+<p>Boosting allows you to control the relevance of a document by boosting its term. For example, if you are searching for</p>
+<pre class="code">jakarta apache</pre>
+<p>and you want the term "jakarta" to be more relevant boost it using the ^ symbol along with the boost factor next to the term.
+        You would type:</p>
+<pre class="code">jakarta^4 apache</pre>
+<p>This will make documents with the term jakarta appear more relevant. You can also boost Phrase Terms as in the example: </p>
+<pre class="code">"jakarta apache"^4 "Apache Lucene"</pre>
+<p>By default, the boost factor is 1. Although the boost factor must be positive, it can be less than 1 (e.g. 0.2)</p>
+</div>
+
+
+        
+<a name="N100FA"></a><a name="Boolean operators"></a>
+<h2 class="boxed">Boolean Operators</h2>
+<div class="section">
+<p>Boolean operators allow terms to be combined through logic operators.
+        Lucene supports AND, "+", OR, NOT and "-" as Boolean operators (Note: Boolean operators must be ALL CAPS).</p>
+<a name="N10103"></a><a name="OR"></a>
+<h3 class="boxed">OR</h3>
+<p>The OR operator is the default conjunction operator. This means that if there is no Boolean operator between two terms, the OR operator is used.
+        The OR operator links two terms and finds a matching document if either of the terms exist in a document. This is equivalent to a union using sets.
+        The symbol || can be used in place of the word OR.</p>
+<p>To search for documents that contain either "jakarta apache" or just "jakarta" use the query:</p>
+<pre class="code">"jakarta apache" jakarta</pre>
+<p>or</p>
+<pre class="code">"jakarta apache" OR jakarta</pre>
+<a name="N10116"></a><a name="AND"></a>
+<h3 class="boxed">AND</h3>
+<p>The AND operator matches documents where both terms exist anywhere in the text of a single document.
+        This is equivalent to an intersection using sets. The symbol &amp;&amp; can be used in place of the word AND.</p>
+<p>To search for documents that contain "jakarta apache" and "Apache Lucene" use the query: </p>
+<pre class="code">"jakarta apache" AND "Apache Lucene"</pre>
+<a name="N10126"></a><a name="+"></a>
+<h3 class="boxed">+</h3>
+<p>The "+" or required operator requires that the term after the "+" symbol exist somewhere in the field of a single document.</p>
+<p>To search for documents that must contain "jakarta" and may contain "lucene" use the query:</p>
+<pre class="code">+jakarta lucene</pre>
+<a name="N10136"></a><a name="NOT"></a>
+<h3 class="boxed">NOT</h3>
+<p>The NOT operator excludes documents that contain the term after NOT.
+        This is equivalent to a difference using sets. The symbol ! can be used in place of the word NOT.</p>
+<p>To search for documents that contain "jakarta apache" but not "Apache Lucene" use the query: </p>
+<pre class="code">"jakarta apache" NOT "Apache Lucene"</pre>
+<p>Note: The NOT operator cannot be used with just one term. For example, the following search will return no results:</p>
+<pre class="code">NOT "jakarta apache"</pre>
+<a name="N1014C"></a><a name="-"></a>
+<h3 class="boxed">-</h3>
+<p>The "-" or prohibit operator excludes documents that contain the term after the "-" symbol.</p>
+<p>To search for documents that contain "jakarta apache" but not "Apache Lucene" use the query: </p>
+<pre class="code">"jakarta apache" -"Apache Lucene"</pre>
+</div>
+
+        
+<a name="N1015D"></a><a name="Grouping"></a>
+<h2 class="boxed">Grouping</h2>
+<div class="section">
+<p>Lucene supports using parentheses to group clauses to form sub queries. This can be very useful if you want to control the boolean logic for a query.</p>
+<p>To search for either "jakarta" or "apache" and "website" use the query:</p>
+<pre class="code">(jakarta OR apache) AND website</pre>
+<p>This eliminates any confusion and makes sure that website must exist and either term jakarta or apache may exist.</p>
+</div>
+
+        
+<a name="N10170"></a><a name="Field Grouping"></a>
+<h2 class="boxed">Field Grouping</h2>
+<div class="section">
+<p>Lucene supports using parentheses to group multiple clauses to a single field.</p>
+<p>To search for a title that contains both the word "return" and the phrase "pink panther" use the query:</p>
+<pre class="code">title:(+return +"pink panther")</pre>
+</div>
+
+        
+<a name="N10180"></a><a name="Escaping Special Characters"></a>
+<h2 class="boxed">Escaping Special Characters</h2>
+<div class="section">
+<p>Lucene supports escaping special characters that are part of the query syntax. The current list of special characters is</p>
+<p>+ - &amp;&amp; || ! ( ) { } [ ] ^ " ~ * ? : \</p>
+<p>To escape these character use the \ before the character. For example to search for (1+1):2 use the query:</p>
+<pre class="code">\(1\+1\)\:2</pre>
+</div>
+
 </body>
 </html>

Added: lucene/dev/trunk/lucene/site/html/demo.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/site/html/demo.html?rev=1328748&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/site/html/demo.html (added)
+++ lucene/dev/trunk/lucene/site/html/demo.html Sun Apr 22 00:07:15 2012
@@ -0,0 +1,72 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Apache Lucene - Building and Installing the Basic Demo</title>
+</head>
+<body>
+<h1>Apache Lucene - Building and Installing the Basic Demo</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li><a href="#About%20this%20Document">About this Document</a></li>
+<li><a href="#About%20the%20Demo">About the Demo</a></li>
+<li><a href="#Setting%20your%20CLASSPATH">Setting your CLASSPATH</a></li>
+<li><a href="#Indexing%20Files">Indexing Files</a></li>
+<li><a href="#About%20the%20code...">About the code...</a></li>
+</ul>
+</div>
+<a name="N10013" id="N10013"></a><a name="About this Document"></a>
+<h2 class="boxed">About this Document</h2>
+<div class="section">
+<p>This document is intended as a "getting started" guide to using and running
+the Lucene demos. It walks you through some basic installation and
+configuration.</p>
+</div>
+<a name="N1001C" id="N1001C"></a><a name="About the Demo"></a>
+<h2 class="boxed">About the Demo</h2>
+<div class="section">
+<p>The Lucene command-line demo code consists of an application that
+demonstrates various functionalities of Lucene and how you can add Lucene to
+your applications.</p>
+</div>
+<a name="N10025" id="N10025"></a><a name="Setting your CLASSPATH"></a>
+<h2 class="boxed">Setting your CLASSPATH</h2>
+<div class="section">
+<p>First, you should <a href=
+"http://www.apache.org/dyn/closer.cgi/lucene/java/">download</a> the latest
+Lucene distribution and then extract it to a working directory.</p>
+<p>You need three JARs: the Lucene JAR, the common analysis JAR, and the Lucene
+demo JAR. You should see the Lucene JAR file in the core/ directory you created
+when you extracted the archive -- it should be named something like
+<span class="codefrag">lucene-core-{version}.jar</span>. You should also see
+files called <span class=
+"codefrag">lucene-analyzers-common-{version}.jar</span> and <span class=
+"codefrag">lucene-demo-{version}.jar</span> under analysis/common/ and demo/,
+respectively.</p>
+<p>Put all three of these files in your Java CLASSPATH.</p>
+</div>
+<a name="N10041" id="N10041"></a><a name="Indexing Files"></a>
+<h2 class="boxed">Indexing Files</h2>
+<div class="section">
+<p>Once you've gotten this far you're probably itching to go. Let's <b>build an
+index!</b> Assuming you've set your CLASSPATH correctly, just type:</p>
+<pre>
+    java org.apache.lucene.demo.IndexFiles -docs {path-to-lucene}/src
+</pre>
+This will produce a subdirectory called <span class="codefrag">index</span>
+which will contain an index of all of the Lucene source code.
+<p>To <b>search the index</b> type:</p>
+<pre>
+    java org.apache.lucene.demo.SearchFiles
+</pre>
+You'll be prompted for a query. Type in a swear word and press the enter key.
+You'll see that the Lucene developers are very well mannered and get no
+results. Now try entering the word "string". That should return a whole bunch
+of documents. The results will page at every tenth result and ask you whether
+you want more results.</div>
+<a name="N1005C" id="N1005C"></a><a name="About the code..."></a>
+<h2 class="boxed">About the code...</h2>
+<div class="section">
+<p><a href="demo2.html">read on&gt;&gt;&gt;</a></p>
+</div>
+</body>
+</html>

Added: lucene/dev/trunk/lucene/site/html/demo2.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/site/html/demo2.html?rev=1328748&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/site/html/demo2.html (added)
+++ lucene/dev/trunk/lucene/site/html/demo2.html Sun Apr 22 00:07:15 2012
@@ -0,0 +1,148 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Apache Lucene - Basic Demo Sources Walk-through</title>
+</head>
+<body>
+<h1>Apache Lucene - Basic Demo Sources Walk-through</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li><a href="#About%20the%20Code">About the Code</a></li>
+<li><a href="#Location%20of%20the%20source">Location of the source</a></li>
+<li><a href="#IndexFiles">IndexFiles</a></li>
+<li><a href="#Searching%20Files">Searching Files</a></li>
+</ul>
+</div>
+<a name="N10013" id="N10013"></a><a name="About the Code"></a>
+<h2 class="boxed">About the Code</h2>
+<div class="section">
+<p>In this section we walk through the sources behind the command-line Lucene
+demo: where to find them, their parts and their function. This section is
+intended for Java developers wishing to understand how to use Lucene in their
+applications.</p>
+</div>
+<a name="N1001C" id="N1001C"></a><a name="Location of the source"></a>
+<h2 class="boxed">Location of the source</h2>
+<div class="section">
+<p>NOTE: to examine the sources, you need to download and extract a source
+checkout of Lucene: (lucene-{version}-src.zip).</p>
+<p>Relative to the directory created when you extracted Lucene, you should see
+a directory called <span class="codefrag">lucene/demo/</span>. This is the root
+for the Lucene demo. Under this directory is <span class=
+"codefrag">src/java/org/apache/lucene/demo/</span>. This is where all the Java
+sources for the demo live.</p>
+<p>Within this directory you should see the <span class=
+"codefrag">IndexFiles.java</span> class we executed earlier. Bring it up in
+<span class="codefrag">vi</span> or your editor of choice and let's take a look
+at it.</p>
+</div>
+<a name="N10037" id="N10037"></a><a name="IndexFiles" id="IndexFiles"></a>
+<h2 class="boxed">IndexFiles</h2>
+<div class="section">
+<p>As we discussed in the previous walk-through, the <a href=
+"api/demo/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class creates
+a Lucene Index. Let's take a look at how it does this.</p>
+<p>The <span class="codefrag">main()</span> method parses the command-line
+parameters, then in preparation for instantiating <a href=
+"api/core/org/apache/lucene/index/IndexWriter.html">IndexWriter</a>, opens a
+<a href="api/core/org/apache/lucene/store/Directory.html">Directory</a> and
+instantiates <a href=
+"api/analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">
+StandardAnalyzer</a> and <a href=
+"api/core/org/apache/lucene/index/IndexWriterConfig.html">IndexWriterConfig</a>.</p>
+<p>The value of the <span class="codefrag">-index</span> command-line parameter
+is the name of the filesystem directory where all index information should be
+stored. If <span class="codefrag">IndexFiles</span> is invoked with a relative
+path given in the <span class="codefrag">-index</span> command-line parameter,
+or if the <span class="codefrag">-index</span> command-line parameter is not
+given, causing the default relative index path "<span class=
+"codefrag">index</span>" to be used, the index path will be created as a
+subdirectory of the current working directory (if it does not already exist).
+On some platforms, the index path may be created in a different directory (such
+as the user's home directory).</p>
+<p>The <span class="codefrag">-docs</span> command-line parameter value is the
+location of the directory containing files to be indexed.</p>
+<p>The <span class="codefrag">-update</span> command-line parameter tells
+<span class="codefrag">IndexFiles</span> not to delete the index if it already
+exists. When <span class="codefrag">-update</span> is not given, <span class=
+"codefrag">IndexFiles</span> will first wipe the slate clean before indexing
+any documents.</p>
+<p>Lucene <a href=
+"api/core/org/apache/lucene/store/Directory.html">Directory</a>s are used by
+the <span class="codefrag">IndexWriter</span> to store information in the
+index. In addition to the <a href=
+"api/core/org/apache/lucene/store/FSDirectory.html">FSDirectory</a>
+implementation we are using, there are several other <span class=
+"codefrag">Directory</span> subclasses that can write to RAM, to databases,
+etc.</p>
+<p>Lucene <a href=
+"api/core/org/apache/lucene/analysis/Analyzer.html">Analyzer</a>s are
+processing pipelines that break up text into indexed tokens, a.k.a. terms, and
+optionally perform other operations on these tokens, e.g. downcasing, synonym
+insertion, filtering out unwanted tokens, etc. The <span class=
+"codefrag">Analyzer</span> we are using is <span class=
+"codefrag">StandardAnalyzer</span>, which creates tokens using the Word Break
+rules from the Unicode Text Segmentation algorithm specified in <a href=
+"http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>; converts
+tokens to lowercase; and then filters out stopwords. Stopwords are common
+language words such as articles (a, an, the, etc.) and other tokens that may
+have less value for searching. It should be noted that there are different
+rules for every language, and you should use the proper analyzer for each.
+Lucene currently provides Analyzers for a number of different languages (see
+the javadocs under <a href=
+"api/analyzers-common/org/apache/lucene/analysis/">lucene/analysis/common/src/java/org/apache/lucene/analysis</a>).</p>
+<p>The <span class="codefrag">IndexWriterConfig</span> instance holds all
+configuration for <span class="codefrag">IndexWriter</span>. For example, we
+set the <span class="codefrag">OpenMode</span> to use here based on the value
+of the <span class="codefrag">-update</span> command-line parameter.</p>
+<p>Looking further down in the file, after <span class=
+"codefrag">IndexWriter</span> is instantiated, you should see the <span class=
+"codefrag">indexDocs()</span> code. This recursive function crawls the
+directories and creates <a href=
+"api/core/org/apache/lucene/document/Document.html">Document</a> objects. The
+<span class="codefrag">Document</span> is simply a data object to represent the
+text content from the file as well as its creation time and location. These
+instances are added to the <span class="codefrag">IndexWriter</span>. If the
+<span class="codefrag">-update</span> command-line parameter is given, the
+<span class="codefrag">IndexWriter</span> <span class=
+"codefrag">OpenMode</span> will be set to <span class=
+"codefrag">OpenMode.CREATE_OR_APPEND</span>, and rather than adding documents
+to the index, the <span class="codefrag">IndexWriter</span> will
+<strong>update</strong> them in the index by attempting to find an
+already-indexed document with the same identifier (in our case, the file path
+serves as the identifier); deleting it from the index if it exists; and then
+adding the new document to the index.</p>
+</div>
+<a name="N100DB" id="N100DB"></a><a name="Searching Files"></a>
+<h2 class="boxed">Searching Files</h2>
+<div class="section">
+<p>The <a href=
+"api/demo/org/apache/lucene/demo/SearchFiles.html">SearchFiles</a> class is
+quite simple. It primarily collaborates with an <a href=
+"api/core/org/apache/lucene/search/IndexSearcher.html">IndexSearcher</a>,
+<a href=
+"api/analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">
+StandardAnalyzer</a> (which is used in the <a href=
+"api/demo/org/apache/lucene/demo/IndexFiles.html">IndexFiles</a> class as well)
+and a <a href=
+"api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a>. The
+query parser is constructed with an analyzer used to interpret your query text
+in the same way the documents are interpreted: finding word boundaries,
+downcasing, and removing useless words like 'a', 'an' and 'the'. The <a href=
+"api/core/org/apache/lucene/search/Query.html">Query</a> object contains the
+results from the <a href=
+"api/core/org/apache/lucene/queryParser/QueryParser.html">QueryParser</a> which
+is passed to the searcher. Note that it's also possible to programmatically
+construct a rich <a href=
+"api/core/org/apache/lucene/search/Query.html">Query</a> object without using
+the query parser. The query parser just enables decoding the <a href=
+"queryparsersyntax.html">Lucene query syntax</a> into the corresponding
+<a href="api/core/org/apache/lucene/search/Query.html">Query</a> object.</p>
+<p><span class="codefrag">SearchFiles</span> uses the <span class=
+"codefrag">IndexSearcher.search(query,n)</span> method that returns <a href=
+"api/core/org/apache/lucene/search/TopDocs.html">TopDocs</a> with max
+<span class="codefrag">n</span> hits. The results are printed in pages, sorted
+by score (i.e. relevance).</p>
+</div>
+</body>
+</html>

Added: lucene/dev/trunk/lucene/site/html/fileformats.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/site/html/fileformats.html?rev=1328748&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/site/html/fileformats.html (added)
+++ lucene/dev/trunk/lucene/site/html/fileformats.html Sun Apr 22 00:07:15 2012
@@ -0,0 +1,1120 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<title>Apache Lucene - Index File Formats</title>
+
+<style type="text/css">
+ p.c6 {margin-left: -0.47cm; margin-right: 0.01cm; text-align: right}
+ p.c5 {margin-left: -0.07cm; margin-right: 0.01cm; text-align: right}
+ p.c4 {margin-left: 0.11cm; margin-right: 0.01cm; text-align: right}
+ p.c3 {text-align: right}
+ p.c2 {font-weight: bold; text-align: right}
+ p.c1 {font-weight: bold}
+</style>
+</head>
+<body>
+<h1>Apache Lucene - Index File Formats</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li><a href="#Index%20File%20Formats">Index File Formats</a></li>
+<li><a href="#Definitions">Definitions</a>
+<ul class="minitoc">
+<li><a href="#Inverted%20Indexing">Inverted Indexing</a></li>
+<li><a href="#Types%20of%20Fields">Types of Fields</a></li>
+<li><a href="#Segments">Segments</a></li>
+<li><a href="#Document%20Numbers">Document Numbers</a></li>
+</ul>
+</li>
+<li><a href="#Overview">Overview</a></li>
+<li><a href="#File%20Naming">File Naming</a></li>
+<li><a href="#file-names">Summary of File Extensions</a></li>
+<li><a href="#Primitive%20Types">Primitive Types</a>
+<ul class="minitoc">
+<li><a href="#Byte">Byte</a></li>
+<li><a href="#UInt32">UInt32</a></li>
+<li><a href="#Uint64">Uint64</a></li>
+<li><a href="#VInt">VInt</a></li>
+<li><a href="#Chars">Chars</a></li>
+<li><a href="#String">String</a></li>
+</ul>
+</li>
+<li><a href="#Compound%20Types">Compound Types</a>
+<ul class="minitoc">
+<li><a href="#MapStringString">Map&lt;String,String&gt;</a></li>
+</ul>
+</li>
+<li><a href="#Per-Index%20Files">Per-Index Files</a>
+<ul class="minitoc">
+<li><a href="#Segments%20File">Segments File</a></li>
+<li><a href="#Lock%20File">Lock File</a></li>
+<li><a href="#Deletable%20File">Deletable File</a></li>
+<li><a href="#Compound%20Files">Compound Files</a></li>
+</ul>
+</li>
+<li><a href="#Per-Segment%20Files">Per-Segment Files</a>
+<ul class="minitoc">
+<li><a href="#Fields">Fields</a></li>
+<li><a href="#Term%20Dictionary">Term Dictionary</a></li>
+<li><a href="#Frequencies">Frequencies</a></li>
+<li><a href="#Positions">Positions</a></li>
+<li><a href="#Normalization%20Factors">Normalization Factors</a></li>
+<li><a href="#Term%20Vectors">Term Vectors</a></li>
+<li><a href="#Deleted%20Documents">Deleted Documents</a></li>
+</ul>
+</li>
+<li><a href="#Limitations">Limitations</a></li>
+</ul>
+</div>
+<a name="N1000C" id="N1000C"></a><a name="Index File Formats"></a>
+<h2 class="boxed">Index File Formats</h2>
+<div class="section">
+<p>This document defines the index file formats used in this version of Lucene.
+If you are using a different version of Lucene, please consult the copy of
+<span class="codefrag">docs/fileformats.html</span> that was distributed with
+the version you are using.</p>
+<p>Apache Lucene is written in Java, but several efforts are underway to write
+<a href="http://wiki.apache.org/lucene-java/LuceneImplementations">versions of
+Lucene in other programming languages</a>. If these versions are to remain
+compatible with Apache Lucene, then a language-independent definition of the
+Lucene index format is required. This document thus attempts to provide a
+complete and independent definition of the Apache Lucene file formats.</p>
+<p>As Lucene evolves, this document should evolve. Versions of Lucene in
+different programming languages should endeavor to agree on file formats, and
+generate new versions of this document.</p>
+<p>Compatibility notes are provided in this document, describing how file
+formats have changed from prior versions.</p>
+<p>In version 2.1, the file format was changed to allow lock-less commits (ie,
+no more commit lock). The change is fully backwards compatible: you can open a
+pre-2.1 index for searching or adding/deleting of docs. When the new segments
+file is saved (committed), it will be written in the new file format (meaning
+no specific "upgrade" process is needed). But note that once a commit has
+occurred, pre-2.1 Lucene will not be able to read the index.</p>
+<p>In version 2.3, the file format was changed to allow segments to share a
+single set of doc store (vectors &amp; stored fields) files. This allows for
+faster indexing in certain cases. The change is fully backwards compatible (in
+the same way as the lock-less commits change in 2.1).</p>
+<p>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
+Java's modified UTF-8. See issue LUCENE-510 for details.</p>
+<p>In version 2.9, an optional opaque Map&lt;String,String&gt; CommitUserData
+may be passed to IndexWriter's commit methods (and later retrieved), which is
+recorded in the segments_N file. See issue LUCENE-1382 for details. Also,
+diagnostics were added to each segment written recording details about why it
+was written (due to flush, merge; which OS/JRE was used; etc.). See issue
+LUCENE-1654 for details.</p>
+<p>In version 3.0, compressed fields are no longer written to the index (they
+can still be read, but on merge the new segment will write them, uncompressed).
+See issue LUCENE-1960 for details.</p>
+<p>In version 3.1, segments records the code version that created them. See
+LUCENE-2720 for details. Additionally segments track explicitly whether or not
+they have term vectors. See LUCENE-2811 for details.</p>
+<p>In version 3.2, numeric fields are written natively to the stored fields
+file; previously they were stored in text format only.</p>
+<p>In version 3.4, fields can omit position data while still indexing term
+frequencies.</p>
+</div>
+<a name="N1003D" id="N1003D"></a><a name="Definitions" id="Definitions"></a>
+<h2 class="boxed">Definitions</h2>
+<div class="section">
+<p>The fundamental concepts in Lucene are index, document, field and term.</p>
+<p>An index contains a sequence of documents.</p>
+<ul>
+<li>
+<p>A document is a sequence of fields.</p>
+</li>
+<li>
+<p>A field is a named sequence of terms.</p>
+</li>
+<li>A term is a string.</li>
+</ul>
+<p>The same string in two different fields is considered a different term. Thus
+terms are represented as a pair of strings, the first naming the field, and the
+second naming text within the field.</p>
+<a name="N1005D" id="N1005D"></a><a name="Inverted Indexing"></a>
+<h3 class="boxed">Inverted Indexing</h3>
+<p>The index stores statistics about terms in order to make term-based search
+more efficient. Lucene's index falls into the family of indexes known as an
+<i>inverted index.</i> This is because it can list, for a term, the documents
+that contain it. This is the inverse of the natural relationship, in which
+documents list terms.</p>
+<a name="N10069" id="N10069"></a><a name="Types of Fields"></a>
+<h3 class="boxed">Types of Fields</h3>
+<p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
+in the index literally, in a non-inverted manner. Fields that are inverted are
+called <i>indexed</i>. A field may be both stored and indexed.</p>
+<p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
+text of a field may be used literally as a term to be indexed. Most fields are
+tokenized, but sometimes it is useful for certain identifier fields to be
+indexed literally.</p>
+<p>See the <a href="api/core/org/apache/lucene/document/Field.html">Field</a>
+java docs for more information on Fields.</p>
+<a name="N10086" id="N10086"></a><a name="Segments" id="Segments"></a>
+<h3 class="boxed">Segments</h3>
+<p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
+Each segment is a fully independent index, which could be searched separately.
+Indexes evolve by:</p>
+<ol>
+<li>
+<p>Creating new segments for newly added documents.</p>
+</li>
+<li>
+<p>Merging existing segments.</p>
+</li>
+</ol>
+<p>Searches may involve multiple segments and/or multiple indexes, each index
+potentially composed of a set of segments.</p>
+<a name="N100A4" id="N100A4"></a><a name="Document Numbers"></a>
+<h3 class="boxed">Document Numbers</h3>
+<p>Internally, Lucene refers to documents by an integer <i>document number</i>.
+The first document added to an index is numbered zero, and each subsequent
+document added gets a number one greater than the previous.</p>
+<p><br></p>
+<p>Note that a document's number may change, so caution should be taken when
+storing these numbers outside of Lucene. In particular, numbers may change in
+the following situations:</p>
+<ul>
+<li>
+<p>The numbers stored in each segment are unique only within the segment, and
+must be converted before they can be used in a larger context. The standard
+technique is to allocate each segment a range of values, based on the range of
+numbers used in that segment. To convert a document number from a segment to an
+external value, the segment's <i>base</i> document number is added. To convert
+an external value back to a segment-specific value, the segment is identified
+by the range that the external value is in, and the segment's base value is
+subtracted. For example two five document segments might be combined, so that
+the first segment has a base value of zero, and the second of five. Document
+three from the second segment would have an external value of eight.</p>
+</li>
+<li>
+<p>When documents are deleted, gaps are created in the numbering. These are
+eventually removed as the index evolves through merging. Deleted documents are
+dropped when segments are merged. A freshly-merged segment thus has no gaps in
+its numbering.</p>
+</li>
+</ul>
+</div>
+<a name="N100CB" id="N100CB"></a><a name="Overview" id="Overview"></a>
+<h2 class="boxed">Overview</h2>
+<div class="section">
+<p>Each segment index maintains the following:</p>
+<ul>
+<li>
+<p>Field names. This contains the set of field names used in the index.</p>
+</li>
+<li>
+<p>Stored Field values. This contains, for each document, a list of
+attribute-value pairs, where the attributes are field names. These are used to
+store auxiliary information about the document, such as its title, url, or an
+identifier to access a database. The set of stored fields are what is returned
+for each hit when searching. This is keyed by document number.</p>
+</li>
+<li>
+<p>Term dictionary. A dictionary containing all of the terms used in all of the
+indexed fields of all of the documents. The dictionary also contains the number
+of documents which contain the term, and pointers to the term's frequency and
+proximity data.</p>
+</li>
+<li>
+<p>Term Frequency data. For each term in the dictionary, the numbers of all the
+documents that contain that term, and the frequency of the term in that
+document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)</p>
+</li>
+<li>
+<p>Term Proximity data. For each term in the dictionary, the positions that the
+term occurs in each document. Note that this will not exist if all fields in
+all documents omit position data.</p>
+</li>
+<li>
+<p>Normalization factors. For each field in each document, a value is stored
+that is multiplied into the score for hits on that field.</p>
+</li>
+<li>
+<p>Term Vectors. For each field in each document, the term vector (sometimes
+called document vector) may be stored. A term vector consists of term text and
+term frequency. To add Term Vectors to your index see the <a href=
+"api/core/org/apache/lucene/document/Field.html">Field</a> constructors</p>
+</li>
+<li>
+<p>Deleted documents. An optional file indicating which documents are
+deleted.</p>
+</li>
+</ul>
+<p>Details on each of these are provided in subsequent sections.</p>
+</div>
+<a name="N1010E" id="N1010E"></a><a name="File Naming"></a>
+<h2 class="boxed">File Naming</h2>
+<div class="section">
+<p>All files belonging to a segment have the same name with varying extensions.
+The extensions correspond to the different file formats described below. When
+using the Compound File format (default in 1.4 and greater) these files are
+collapsed into a single .cfs file (see below for details)</p>
+<p>Typically, all segments in an index are stored in a single directory,
+although this is not required.</p>
+<p>As of version 2.1 (lock-less commits), file names are never re-used (there
+is one exception, "segments.gen", see below). That is, when any file is saved
+to the Directory it is given a never before used filename. This is achieved
+using a simple generations approach. For example, the first segments file is
+segments_1, then segments_2, etc. The generation is a sequential long integer
+represented in alpha-numeric (base 36) form.</p>
+</div>
+<a name="N1011D" id="N1011D"></a><a name="file-names" id="file-names"></a>
+<h2 class="boxed">Summary of File Extensions</h2>
+<div class="section">
+<p>The following table summarizes the names and extensions of the files in
+Lucene:</p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+<tr>
+<th>Name</th>
+<th>Extension</th>
+<th>Brief Description</th>
+</tr>
+<tr>
+<td><a href="#Segments%20File">Segments File</a></td>
+<td>segments.gen, segments_N</td>
+<td>Stores information about segments</td>
+</tr>
+<tr>
+<td><a href="#Lock%20File">Lock File</a></td>
+<td>write.lock</td>
+<td>The Write lock prevents multiple IndexWriters from writing to the same
+file.</td>
+</tr>
+<tr>
+<td><a href="#Compound%20Files">Compound File</a></td>
+<td>.cfs</td>
+<td>An optional "virtual" file consisting of all the other index files for
+systems that frequently run out of file handles.</td>
+</tr>
+<tr>
+<td><a href="#Compound%20Files">Compound File Entry table</a></td>
+<td>.cfe</td>
+<td>The "virtual" compound file's entry table holding all entries in the
+corresponding .cfs file (Since 3.4)</td>
+</tr>
+<tr>
+<td><a href="#Fields">Fields</a></td>
+<td>.fnm</td>
+<td>Stores information about the fields</td>
+</tr>
+<tr>
+<td><a href="#field_index">Field Index</a></td>
+<td>.fdx</td>
+<td>Contains pointers to field data</td>
+</tr>
+<tr>
+<td><a href="#field_data">Field Data</a></td>
+<td>.fdt</td>
+<td>The stored fields for documents</td>
+</tr>
+<tr>
+<td><a href="#tis">Term Infos</a></td>
+<td>.tis</td>
+<td>Part of the term dictionary, stores term info</td>
+</tr>
+<tr>
+<td><a href="#tii">Term Info Index</a></td>
+<td>.tii</td>
+<td>The index into the Term Infos file</td>
+</tr>
+<tr>
+<td><a href="#Frequencies">Frequencies</a></td>
+<td>.frq</td>
+<td>Contains the list of docs which contain each term along with frequency</td>
+</tr>
+<tr>
+<td><a href="#Positions">Positions</a></td>
+<td>.prx</td>
+<td>Stores position information about where a term occurs in the index</td>
+</tr>
+<tr>
+<td><a href="#Normalization%20Factors">Norms</a></td>
+<td>.nrm</td>
+<td>Encodes length and boost factors for docs and fields</td>
+</tr>
+<tr>
+<td><a href="#tvx">Term Vector Index</a></td>
+<td>.tvx</td>
+<td>Stores offset into the document data file</td>
+</tr>
+<tr>
+<td><a href="#tvd">Term Vector Documents</a></td>
+<td>.tvd</td>
+<td>Contains information about each document that has term vectors</td>
+</tr>
+<tr>
+<td><a href="#tvf">Term Vector Fields</a></td>
+<td>.tvf</td>
+<td>The field level info about term vectors</td>
+</tr>
+<tr>
+<td><a href="#Deleted%20Documents">Deleted Documents</a></td>
+<td>.del</td>
+<td>Info about what files are deleted</td>
+</tr>
+</table>
+</div>
+<a name="N10215" id="N10215"></a><a name="Primitive Types"></a>
+<h2 class="boxed">Primitive Types</h2>
+<div class="section"><a name="N1021A" id="N1021A"></a><a name="Byte" id=
+"Byte"></a>
+<h3 class="boxed">Byte</h3>
+<p>The most primitive type is an eight-bit byte. Files are accessed as
+sequences of bytes. All other data types are defined as sequences of bytes, so
+file formats are byte-order independent.</p>
+<a name="N10223" id="N10223"></a><a name="UInt32" id="UInt32"></a>
+<h3 class="boxed">UInt32</h3>
+<p>32-bit unsigned integers are written as four bytes, high-order bytes
+first.</p>
+<p>UInt32 --&gt; &lt;Byte&gt;<sup>4</sup></p>
+<a name="N10232" id="N10232"></a><a name="Uint64" id="Uint64"></a>
+<h3 class="boxed">Uint64</h3>
+<p>64-bit unsigned integers are written as eight bytes, high-order bytes
+first.</p>
+<p>UInt64 --&gt; &lt;Byte&gt;<sup>8</sup></p>
+<a name="N10241" id="N10241"></a><a name="VInt" id="VInt"></a>
+<h3 class="boxed">VInt</h3>
+<p>A variable-length format for positive integers is defined where the
+high-order bit of each byte indicates whether more bytes remain to be read. The
+low-order seven bits are appended as increasingly more significant bits in the
+resulting integer value. Thus values from zero to 127 may be stored in a single
+byte, values from 128 to 16,383 may be stored in two bytes, and so on.</p>
+<p class="c1">VInt Encoding Example</p>
+<table class="ForrestTable" cellspacing="0" cellpadding="4" border="0">
+<col width="64*">
+<col width="64*">
+<col width="64*">
+<col width="64*">
+<tr valign="top">
+<td width="25%">
+<p class="c2">Value</p>
+</td>
+<td width="25%">
+<p class="c2">First byte</p>
+</td>
+<td width="25%">
+<p class="c2">Second byte</p>
+</td>
+<td width="25%">
+<p class="c2">Third byte</p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="0" width="25%">
+<p class="c3">0</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="0" width="25%">
+<p class="western c4">00000000</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="1" width="25%">
+<p class="c3">1</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c4">00000001</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="2" width="25%">
+<p class="c3">2</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10" width="25%">
+<p class="western c4">00000010</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr>
+<td valign="top" width="25%">
+<p class="c3">...</p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c4"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="127" width="25%">
+<p class="c3">127</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1111111" width="25%">
+<p class="western c4">01111111</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="128" width="25%">
+<p class="c3">128</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+<p class="western c4">10000000</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c5">00000001</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="129" width="25%">
+<p class="c3">129</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000001" width="25%">
+<p class="western c4">10000001</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c5">00000001</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="130" width="25%">
+<p class="c3">130</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000010" width="25%">
+<p class="western c4">10000010</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c5">00000001</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr>
+<td valign="top" width="25%">
+<p class="c3">...</p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c4"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="16383" width="25%">
+<p class="c3">16,383</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="11111111" width="25%">
+<p class="western c4">11111111</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1111111" width="25%">
+<p class="western c5">01111111</p>
+</td>
+<td sdnum="1033;0;00000000" width="25%">
+<p class="c6"><br></p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="16384" width="25%">
+<p class="c3">16,384</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+<p class="western c4">10000000</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+<p class="western c5">10000000</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c6">00000001</p>
+</td>
+</tr>
+<tr valign="bottom">
+<td sdnum="1033;0;#,##0" sdval="16385" width="25%">
+<p class="c3">16,385</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000001" width="25%">
+<p class="western c4">10000001</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="10000000" width="25%">
+<p class="western c5">10000000</p>
+</td>
+<td sdnum="1033;0;00000000" sdval="1" width="25%">
+<p class="western c6">00000001</p>
+</td>
+</tr>
+<tr>
+<td valign="top" width="25%">
+<p class="c3">...</p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="western c4"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="western c5"><br></p>
+</td>
+<td sdnum="1033;0;00000000" valign="bottom" width="25%">
+<p class="western c6"><br></p>
+</td>
+</tr>
+</table>
+<p>This provides compression while still being efficient to decode.</p>
+<a name="N10526" id="N10526"></a><a name="Chars" id="Chars"></a>
+<h3 class="boxed">Chars</h3>
+<p>Lucene writes unicode character sequences as UTF-8 encoded bytes.</p>
+<a name="N1052F" id="N1052F"></a><a name="String" id="String"></a>
+<h3 class="boxed">String</h3>
+<p>Lucene writes strings as UTF-8 encoded bytes. First the length, in bytes, is
+written as a VInt, followed by the bytes.</p>
+<p>String --&gt; VInt, Chars</p>
+</div>
+<a name="N1053C" id="N1053C"></a><a name="Compound Types"></a>
+<h2 class="boxed">Compound Types</h2>
+<div class="section"><a name="N10541" id="N10541"></a><a name="MapStringString"
+id="MapStringString"></a>
+<h3 class="boxed">Map&lt;String,String&gt;</h3>
+<p>In a couple places Lucene stores a Map String-&gt;String.</p>
+<p>Map&lt;String,String&gt; --&gt;
+Count&lt;String,String&gt;<sup>Count</sup></p>
+</div>
+<a name="N10551" id="N10551"></a><a name="Per-Index Files"></a>
+<h2 class="boxed">Per-Index Files</h2>
+<div class="section">
+<p>The files in this section exist one-per-index.</p>
+<a name="N10559" id="N10559"></a><a name="Segments File"></a>
+<h3 class="boxed">Segments File</h3>
+<p>The active segments in the index are stored in the segment info file,
+<tt>segments_N</tt>. There may be one or more <tt>segments_N</tt> files in the
+index; however, the one with the largest generation is the active one (when
+older segments_N files are present it's because they temporarily cannot be
+deleted, or, a writer is in the process of committing, or a custom <a href=
+"api/core/org/apache/lucene/index/IndexDeletionPolicy.html">IndexDeletionPolicy</a>
+is in use). This file lists each segment by name, has details about the
+separate norms and deletion files, and also contains the size of each
+segment.</p>
+<p>As of 2.1, there is also a file <tt>segments.gen</tt>. This file contains
+the current generation (the <tt>_N</tt> in <tt>segments_N</tt>) of the index.
+This is used only as a fallback in case the current generation cannot be
+accurately determined by directory listing alone (as is the case for some NFS
+clients with time-based directory cache expiration). This file simply contains
+an Int32 version header (SegmentInfos.FORMAT_LOCKLESS = -2), followed by the
+generation recorded as Int64, written twice.</p>
+<p><b>3.1</b> Segments --&gt; Format, Version, NameCounter, SegCount,
+&lt;SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment,
+DocStoreIsCompoundFile], HasSingleNormFile, NumField,
+NormGen<sup>NumField</sup>, IsCompoundFile, DeletionCount, HasProx,
+Diagnostics, HasVectors&gt;<sup>SegCount</sup>, CommitUserData, Checksum</p>
+<p>Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset,
+DeletionCount --&gt; Int32</p>
+<p>Version, DelGen, NormGen, Checksum --&gt; Int64</p>
+<p>SegVersion, SegName, DocStoreSegment --&gt; String</p>
+<p>Diagnostics --&gt; Map&lt;String,String&gt;</p>
+<p>IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile, HasProx,
+HasVectors --&gt; Int8</p>
+<p>CommitUserData --&gt; Map&lt;String,String&gt;</p>
+<p>Format is -9 (SegmentInfos.FORMAT_DIAGNOSTICS).</p>
+<p>Version counts how often the index has been changed by adding or deleting
+documents.</p>
+<p>NameCounter is used to generate names for new segment files.</p>
+<p>SegVersion is the code version that created the segment.</p>
+<p>SegName is the name of the segment, and is used as the file name prefix for
+all of the files that compose the segment's index.</p>
+<p>SegSize is the number of documents contained in the segment index.</p>
+<p>DelGen is the generation count of the separate deletes file. If this is -1,
+there are no separate deletes. If it is 0, this is a pre-2.1 segment and you
+must check filesystem for the existence of _X.del. Anything above zero means
+there are separate deletes (_X_N.del).</p>
+<p>NumField is the size of the array for NormGen, or -1 if there are no
+NormGens stored.</p>
+<p>NormGen records the generation of the separate norms files. If NumField is
+-1, there are no normGens stored and they are all assumed to be 0 when the
+segment file was written pre-2.1 and all assumed to be -1 when the segments
+file is 2.1 or above. The generation then has the same meaning as delGen
+(above).</p>
+<p>IsCompoundFile records whether the segment is written as a compound file or
+not. If this is -1, the segment is not a compound file. If it is 1, the segment
+is a compound file. Else it is 0, which means we check filesystem to see if
+_X.cfs exists.</p>
+<p>If HasSingleNormFile is 1, then the field norms are written as a single
+joined file (with extension <tt>.nrm</tt>); if it is 0 then each field's norms
+are stored as separate <tt>.fN</tt> files. See "Normalization Factors" below
+for details.</p>
+<p>DocStoreOffset, DocStoreSegment, DocStoreIsCompoundFile: If DocStoreOffset
+is -1, this segment has its own doc store (stored fields values and term
+vectors) files and DocStoreSegment and DocStoreIsCompoundFile are not stored.
+In this case all files for stored field values (<tt>*.fdt</tt> and
+<tt>*.fdx</tt>) and term vectors (<tt>*.tvf</tt>, <tt>*.tvd</tt> and
+<tt>*.tvx</tt>) will be stored with this segment. Otherwise, DocStoreSegment is
+the name of the segment that has the shared doc store files;
+DocStoreIsCompoundFile is 1 if that segment is stored in compound file format
+(as a <tt>.cfx</tt> file); and DocStoreOffset is the starting document in the
+shared doc store files where this segment's documents begin. In this case, this
+segment does not store its own doc store files but instead shares a single set
+of these files with other segments.</p>
+<p>Checksum contains the CRC32 checksum of all bytes in the segments_N file up
+until the checksum. This is used to verify integrity of the file on opening the
+index.</p>
+<p>DeletionCount records the number of deleted documents in this segment.</p>
+<p>HasProx is 1 if any fields in this segment have position data
+(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); else, it's 0.</p>
+<p>CommitUserData stores an optional user-supplied opaque
+Map&lt;String,String&gt; that was passed to IndexWriter's commit or
+prepareCommit, or IndexReader's flush methods.</p>
+<p>The Diagnostics Map is privately written by IndexWriter, as a debugging aid,
+for each segment it creates. It includes metadata like the current Lucene
+version, OS, Java version, why the segment was created (merge, flush,
+addIndexes), etc.</p>
+<p>HasVectors is 1 if this segment stores term vectors, else it's 0.</p>
+<a name="N105E4" id="N105E4"></a><a name="Lock File"></a>
+<h3 class="boxed">Lock File</h3>
+<p>The write lock, which is stored in the index directory by default, is named
+"write.lock". If the lock directory is different from the index directory then
+the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
+derived from the full path to the index directory. When this file is present, a
+writer is currently modifying the index (adding or removing documents). This
+lock file ensures that only one writer is modifying the index at a time.</p>
+<a name="N105ED" id="N105ED"></a><a name="Deletable File"></a>
+<h3 class="boxed">Deletable File</h3>
+<p>A writer dynamically computes the files that are deletable, instead, so no
+file is written.</p>
+<a name="N105F6" id="N105F6"></a><a name="Compound Files"></a>
+<h3 class="boxed">Compound Files</h3>
+<p>Starting with Lucene 1.4 the compound file format became default. This is
+simply a container for all files described in the next section (except for the
+.del file).</p>
+<p>Compound Entry Table (.cfe) --&gt; Version, FileCount, &lt;FileName,
+DataOffset, DataLength&gt; <sup>FileCount</sup></p>
+<p>Compound (.cfs) --&gt; FileData <sup>FileCount</sup></p>
+<p>Version --&gt; Int</p>
+<p>FileCount --&gt; VInt</p>
+<p>DataOffset --&gt; Long</p>
+<p>DataLength --&gt; Long</p>
+<p>FileName --&gt; String</p>
+<p>FileData --&gt; raw file data</p>
+<p>The raw file data is the data from the individual files named above.</p>
+<p>Starting with Lucene 2.3, doc store files (stored field values and term
+vectors) can be shared in a single set of files for more than one segment. When
+compound file is enabled, these shared files will be added into a single
+compound file (same format as above) but with the extension <tt>.cfx</tt>.</p>
+</div>
+<a name="N10627" id="N10627"></a><a name="Per-Segment Files"></a>
+<h2 class="boxed">Per-Segment Files</h2>
+<div class="section">
+<p>The remaining files are all per-segment, and are thus defined by suffix.</p>
+<a name="N1062F" id="N1062F"></a><a name="Fields" id="Fields"></a>
+<h3 class="boxed">Fields</h3>
+<p><br>
+<b>Field Info</b><br></p>
+<p>Field names are stored in the field info file, with suffix .fnm.</p>
+<p>FieldInfos (.fnm) --&gt; FNMVersion,FieldsCount, &lt;FieldName,
+FieldBits&gt; <sup>FieldsCount</sup></p>
+<p>FNMVersion, FieldsCount --&gt; VInt</p>
+<p>FieldName --&gt; String</p>
+<p>FieldBits --&gt; Byte</p>
+<ul>
+<li>The low-order bit is one for indexed fields, and zero for non-indexed
+fields.</li>
+<li>The second lowest-order bit is one for fields that have term vectors
+stored, and zero for fields without term vectors.</li>
+<li>If the fifth lowest-order bit is set (0x10), norms are omitted for the
+indexed field.</li>
+<li>If the sixth lowest-order bit is set (0x20), payloads are stored for the
+indexed field.</li>
+<li>If the seventh lowest-order bit is set (0x40), term frequencies and
+positions are omitted for the indexed field.</li>
+<li>If the eighth lowest-order bit is set (0x80), positions are omitted for the
+indexed field.</li>
+</ul>
+<p>FNMVersion (added in 2.9) is -2 for indexes from 2.9 - 3.3. It is -3 for
+indexes in Lucene 3.4+</p>
+<p>Fields are numbered by their order in this file. Thus field zero is the
+first field in the file, field one the next, and so on. Note that, like
+document numbers, field numbers are segment relative.</p>
+<p><br>
+<b>Stored Fields</b><br></p>
+<p>Stored fields are represented by two files:</p>
+<ol>
+<li><a name="field_index" id="field_index"></a>
+<p>The field index, or .fdx file.</p>
+<p>This contains, for each document, a pointer to its field data, as
+follows:</p>
+<p>FieldIndex (.fdx) --&gt; &lt;FieldValuesPosition&gt; <sup>SegSize</sup></p>
+<p>FieldValuesPosition --&gt; Uint64</p>
+<p>This is used to find the location within the field data file of the fields
+of a particular document. Because it contains fixed-length data, this file may
+be easily randomly accessed. The position of document <i>n</i> 's field data is
+the Uint64 at <i>n*8</i> in this file.</p>
+</li>
+<li>
+<p><a name="field_data" id="field_data"></a> The field data, or .fdt file.</p>
+<p>This contains the stored fields of each document, as follows:</p>
+<p>FieldData (.fdt) --&gt; &lt;DocFieldData&gt; <sup>SegSize</sup></p>
+<p>DocFieldData --&gt; FieldCount, &lt;FieldNum, Bits, Value&gt;
+<sup>FieldCount</sup></p>
+<p>FieldCount --&gt; VInt</p>
+<p>FieldNum --&gt; VInt</p>
+<p>Bits --&gt; Byte</p>
+<ul>
+<li>low order bit is one for tokenized fields</li>
+<li>second bit is one for fields containing binary data</li>
+<li>third bit is one for fields with compression option enabled (if compression
+is enabled, the algorithm used is ZLIB), only available for indexes until
+Lucene version 2.9.x</li>
+<li>4th to 6th bit (mask: 0x7&lt;&lt;3) define the type of a numeric field:
+<ul>
+<li>all bits in mask are cleared if no numeric field at all</li>
+<li>1&lt;&lt;3: Value is Int</li>
+<li>2&lt;&lt;3: Value is Long</li>
+<li>3&lt;&lt;3: Value is Int as Float (as of Float.intBitsToFloat)</li>
+<li>4&lt;&lt;3: Value is Long as Double (as of Double.longBitsToDouble)</li>
+</ul>
+</li>
+</ul>
+<p>Value --&gt; String | BinaryValue | Int | Long (depending on Bits)</p>
+<p>BinaryValue --&gt; ValueSize, &lt;Byte&gt;^ValueSize</p>
+<p>ValueSize --&gt; VInt</p>
+</li>
+</ol>
+<a name="N106EA" id="N106EA"></a><a name="Term Dictionary"></a>
+<h3 class="boxed">Term Dictionary</h3>
+<p>The term dictionary is represented as two files:</p>
+<ol>
+<li><a name="tis" id="tis"></a>
+<p>The term infos, or tis file.</p>
+<p>TermInfoFile (.tis)--&gt; TIVersion, TermCount, IndexInterval, SkipInterval,
+MaxSkipLevels, TermInfos</p>
+<p>TIVersion --&gt; UInt32</p>
+<p>TermCount --&gt; UInt64</p>
+<p>IndexInterval --&gt; UInt32</p>
+<p>SkipInterval --&gt; UInt32</p>
+<p>MaxSkipLevels --&gt; UInt32</p>
+<p>TermInfos --&gt; &lt;TermInfo&gt; <sup>TermCount</sup></p>
+<p>TermInfo --&gt; &lt;Term, DocFreq, FreqDelta, ProxDelta, SkipDelta&gt;</p>
+<p>Term --&gt; &lt;PrefixLength, Suffix, FieldNum&gt;</p>
+<p>Suffix --&gt; String</p>
+<p>PrefixLength, DocFreq, FreqDelta, ProxDelta, SkipDelta<br>
+--&gt; VInt</p>
+<p>This file is sorted by Term. Terms are ordered first lexicographically (by
+UTF16 character code) by the term's field name, and within that
+lexicographically (by UTF16 character code) by the term's text.</p>
+<p>TIVersion names the version of the format of this file and is equal to
+TermInfosWriter.FORMAT_CURRENT.</p>
+<p>Term text prefixes are shared. The PrefixLength is the number of initial
+characters from the previous term which must be pre-pended to a term's suffix
+in order to form the term's text. Thus, if the previous term's text was "bone"
+and the term is "boy", the PrefixLength is two and the suffix is "y".</p>
+<p>FieldNum determines the term's field, whose name is stored in the .fnm
+file.</p>
+<p>DocFreq is the count of documents which contain the term.</p>
+<p>FreqDelta determines the position of this term's TermFreqs within the .frq
+file. In particular, it is the difference between the position of this term's
+data in that file and the position of the previous term's data (or zero, for
+the first term in the file).</p>
+<p>ProxDelta determines the position of this term's TermPositions within the
+.prx file. In particular, it is the difference between the position of this
+term's data in that file and the position of the previous term's data (or zero,
+for the first term in the file). For fields that omit position data, this will
+be 0 since prox information is not stored.</p>
+<p>SkipDelta determines the position of this term's SkipData within the .frq
+file. In particular, it is the number of bytes after TermFreqs that the
+SkipData starts. In other words, it is the length of the TermFreq data.
+SkipDelta is only stored if DocFreq is not smaller than SkipInterval.</p>
+</li>
+<li>
+<p><a name="tii" id="tii"></a> The term info index, or .tii file.</p>
+<p>This contains every IndexInterval <sup>th</sup> entry from the .tis file,
+along with its location in the "tis" file. This is designed to be read entirely
+into memory and used to provide random access to the "tis" file.</p>
+<p>The structure of this file is very similar to the .tis file, with the
+addition of one item per record, the IndexDelta.</p>
+<p>TermInfoIndex (.tii)--&gt; TIVersion, IndexTermCount, IndexInterval,
+SkipInterval, MaxSkipLevels, TermIndices</p>
+<p>TIVersion --&gt; UInt32</p>
+<p>IndexTermCount --&gt; UInt64</p>
+<p>IndexInterval --&gt; UInt32</p>
+<p>SkipInterval --&gt; UInt32</p>
+<p>MaxSkipLevels --&gt; UInt32</p>
+<p>TermIndices --&gt; &lt;TermInfo, IndexDelta&gt;
+<sup>IndexTermCount</sup></p>
+<p>IndexDelta --&gt; VLong</p>
+<p>IndexDelta determines the position of this term's TermInfo within the .tis
+file. In particular, it is the difference between the position of this term's
+entry in that file and the position of the previous term's entry.</p>
+<p>SkipInterval is the fraction of TermDocs stored in skip tables. It is used
+to accelerate TermDocs.skipTo(int). Larger values result in smaller indexes,
+greater acceleration, but fewer accelerable cases, while smaller values result
+in bigger indexes, less acceleration (in case of a small value for
+MaxSkipLevels) and more accelerable cases.</p>
+<p>MaxSkipLevels is the max. number of skip levels stored for each term in the
+.frq file. A low value results in smaller indexes but less acceleration, a
+larger value results in slightly larger indexes but greater acceleration. See
+format of .frq file for more information about skip levels.</p>
+</li>
+</ol>
+<a name="N1076E" id="N1076E"></a><a name="Frequencies" id="Frequencies"></a>
+<h3 class="boxed">Frequencies</h3>
+<p>The .frq file contains the lists of documents which contain each term, along
+with the frequency of the term in that document (except when frequencies are
+omitted: IndexOptions.DOCS_ONLY).</p>
+<p>FreqFile (.frq) --&gt; &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></p>
+<p>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></p>
+<p>TermFreq --&gt; DocDelta[, Freq?]</p>
+<p>SkipData --&gt; &lt;&lt;SkipLevelLength, SkipLevel&gt;
+<sup>NumSkipLevels-1</sup>, SkipLevel&gt; &lt;SkipDatum&gt;</p>
+<p>SkipLevel --&gt; &lt;SkipDatum&gt; <sup>DocFreq/(SkipInterval^(Level +
+1))</sup></p>
+<p>SkipDatum --&gt;
+DocSkip,PayloadLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?</p>
+<p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --&gt; VInt</p>
+<p>SkipChildLevelPointer --&gt; VLong</p>
+<p>TermFreqs are ordered by term (the term is implicit, from the .tis
+file).</p>
+<p>TermFreq entries are ordered by increasing document number.</p>
+<p>DocDelta: if frequencies are indexed, this determines both the document
+number and the frequency. In particular, DocDelta/2 is the difference between
+this document number and the previous document number (or zero when this is the
+first document in a TermFreqs). When DocDelta is odd, the frequency is one.
+When DocDelta is even, the frequency is read as another VInt. If frequencies
+are omitted, DocDelta contains the gap (not multiplied by 2) between document
+numbers and no frequency information is stored.</p>
+<p>For example, the TermFreqs for a term which occurs once in document seven
+and three times in document eleven, with frequencies indexed, would be the
+following sequence of VInts:</p>
+<p>15, 8, 3</p>
+<p>If frequencies were omitted (IndexOptions.DOCS_ONLY) it would be this
+sequence of VInts instead:</p>
+<p>7,4</p>
+<p>DocSkip records the document number before every SkipInterval <sup>th</sup>
+document in TermFreqs. If payloads are disabled for the term's field, then
+DocSkip represents the difference from the previous value in the sequence. If
+payloads are enabled for the term's field, then DocSkip/2 represents the
+difference from the previous value in the sequence. If payloads are enabled and
+DocSkip is odd, then PayloadLength is stored indicating the length of the last
+payload before the SkipInterval<sup>th</sup> document in TermPositions.
+FreqSkip and ProxSkip record the position of every SkipInterval <sup>th</sup>
+entry in FreqFile and ProxFile, respectively. File positions are relative to
+the start of TermFreqs and Positions, to the previous SkipDatum in the
+sequence.</p>
+<p>For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData
+entries, containing the 15 <sup>th</sup> and 31 <sup>st</sup> document numbers
+in TermFreqs. The first FreqSkip names the number of bytes after the beginning
+of TermFreqs that the 16 <sup>th</sup> SkipDatum starts, and the second the
+number of bytes after that that the 32 <sup>nd</sup> starts. The first ProxSkip
+names the number of bytes after the beginning of Positions that the 16
+<sup>th</sup> SkipDatum starts, and the second the number of bytes after that
+that the 32 <sup>nd</sup> starts.</p>
+<p>Each term can have multiple skip levels. The amount of skip levels for a
+term is NumSkipLevels = Min(MaxSkipLevels,
+floor(log(DocFreq)/log(SkipInterval))). The number of SkipData entries for a
+skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip level
+is Level=0.<br>
+Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0
+has 8 SkipData entries, containing the 3<sup>rd</sup>, 7<sup>th</sup>,
+11<sup>th</sup>, 15<sup>th</sup>, 19<sup>th</sup>, 23<sup>rd</sup>,
+27<sup>th</sup>, and 31<sup>st</sup> document numbers in TermFreqs. Skip level
+1 has 2 SkipData entries, containing the 15<sup>th</sup> and 31<sup>st</sup>
+document numbers in TermFreqs.<br>
+The SkipData entries on all upper levels &gt; 0 contain a SkipChildLevelPointer
+referencing the corresponding SkipData entry in level-1. In the example, entry
+15 on level 1 has a pointer to entry 15 on level 0, and entry 31 on level 1
+has a pointer to entry 31 on level 0.</p>
+<a name="N107F6" id="N107F6"></a><a name="Positions" id="Positions"></a>
+<h3 class="boxed">Positions</h3>
+<p>The .prx file contains the lists of positions that each term occurs at
+within documents. Note that fields omitting positional data do not store
+anything into this file, and if all fields in the index omit positional data
+then the .prx file will not exist.</p>
+<p>ProxFile (.prx) --&gt; &lt;TermPositions&gt; <sup>TermCount</sup></p>
+<p>TermPositions --&gt; &lt;Positions&gt; <sup>DocFreq</sup></p>
+<p>Positions --&gt; &lt;PositionDelta,Payload?&gt; <sup>Freq</sup></p>
+<p>Payload --&gt; &lt;PayloadLength?,PayloadData&gt;</p>
+<p>PositionDelta --&gt; VInt</p>
+<p>PayloadLength --&gt; VInt</p>
+<p>PayloadData --&gt; byte<sup>PayloadLength</sup></p>
+<p>TermPositions are ordered by term (the term is implicit, from the .tis
+file).</p>
+<p>Positions entries are ordered by increasing document number (the document
+number is implicit from the .frq file).</p>
+<p>PositionDelta is, if payloads are disabled for the term's field, the
+difference between the position of the current occurrence in the document and
+the previous occurrence (or zero, if this is the first occurrence in this
+document). If payloads are enabled for the term's field, then PositionDelta/2
+is the difference between the current and the previous position. If payloads
+are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
+the length of the payload at the current term position.</p>
+<p>For example, the TermPositions for a term which occurs as the fourth term in
+one document, and as the fifth and ninth term in a subsequent document, would
+be the following sequence of VInts (payloads disabled):</p>
+<p>4, 5, 4</p>
+<p>PayloadData is metadata associated with the current term position. If
+PayloadLength is stored at the current position, then it indicates the length
+of this Payload. If PayloadLength is not stored, then this Payload has the same
+length as the Payload at the previous position.</p>
+<a name="N10832" id="N10832"></a><a name="Normalization Factors"></a>
+<h3 class="boxed">Normalization Factors</h3>
+<p>There's a single .nrm file containing all norms:</p>
+<p>AllNorms (.nrm) --&gt; NormsHeader,&lt;Norms&gt;
+<sup>NumFieldsWithNorms</sup></p>
+<p>Norms --&gt; &lt;Byte&gt; <sup>SegSize</sup></p>
+<p>NormsHeader --&gt; 'N','R','M',Version</p>
+<p>Version --&gt; Byte</p>
+<p>NormsHeader has 4 bytes, last of which is the format version for this file,
+currently -1.</p>
+<p>Each byte encodes a floating point value. Bits 0-2 contain the 3-bit
+mantissa, and bits 3-7 contain the 5-bit exponent.</p>
+<p>These are converted to an IEEE single float value as follows:</p>
+<ol>
+<li>
+<p>If the byte is zero, use a zero float.</p>
+</li>
+<li>
+<p>Otherwise, set the sign bit of the float to zero;</p>
+</li>
+<li>
+<p>add 48 to the exponent and use this as the float's exponent;</p>
+</li>
+<li>
+<p>map the mantissa to the high-order 3 bits of the float's mantissa; and</p>
+</li>
+<li>
+<p>set the low-order 21 bits of the float's mantissa to zero.</p>
+</li>
+</ol>
+<p>A separate norm file is created when the norm values of an existing segment
+are modified. When field <em>N</em> is modified, a separate norm file
+<em>.sN</em> is created, to maintain the norm values for that field.</p>
+<p>Separate norm files are created (when adequate) for both compound and non
+compound segments.</p>
+<a name="N10883" id="N10883"></a><a name="Term Vectors"></a>
+<h3 class="boxed">Term Vectors</h3>
+<p>Term Vector support is optional on a field-by-field basis. It consists of
+3 files.</p>
+<ol>
+<li><a name="tvx" id="tvx"></a>
+<p>The Document Index or .tvx file.</p>
+<p>For each document, this stores the offset into the document data (.tvd) and
+field data (.tvf) files.</p>
+<p>DocumentIndex (.tvx) --&gt; TVXVersion&lt;DocumentPosition,FieldPosition&gt;
+<sup>NumDocs</sup></p>
+<p>TVXVersion --&gt; Int (TermVectorsReader.CURRENT)</p>
+<p>DocumentPosition --&gt; UInt64 (offset in the .tvd file)</p>
+<p>FieldPosition --&gt; UInt64 (offset in the .tvf file)</p>
+</li>
+<li><a name="tvd" id="tvd"></a>
+<p>The Document or .tvd file.</p>
+<p>This contains, for each document, the number of fields, a list of the fields
+with term vector info and finally a list of pointers to the field information
+in the .tvf (Term Vector Fields) file.</p>
+<p>Document (.tvd) --&gt; TVDVersion&lt;NumFields, FieldNums,
+FieldPositions&gt; <sup>NumDocs</sup></p>
+<p>TVDVersion --&gt; Int (TermVectorsReader.FORMAT_CURRENT)</p>
+<p>NumFields --&gt; VInt</p>
+<p>FieldNums --&gt; &lt;FieldNumDelta&gt; <sup>NumFields</sup></p>
+<p>FieldNumDelta --&gt; VInt</p>
+<p>FieldPositions --&gt; &lt;FieldPositionDelta&gt; <sup>NumFields-1</sup></p>
+<p>FieldPositionDelta --&gt; VLong</p>
+<p>The .tvd file is used to map out the fields that have term vectors stored
+and where the field information is in the .tvf file.</p>
+</li>
+<li><a name="tvf" id="tvf"></a>
+<p>The Field or .tvf file.</p>
+<p>This file contains, for each field that has a term vector stored, a list of
+the terms, their frequencies and, optionally, position and offset
+information.</p>
+<p>Field (.tvf) --&gt; TVFVersion&lt;NumTerms, Position/Offset, TermFreqs&gt;
+<sup>NumFields</sup></p>
+<p>TVFVersion --&gt; Int (TermVectorsReader.FORMAT_CURRENT)</p>
+<p>NumTerms --&gt; VInt</p>
+<p>Position/Offset --&gt; Byte</p>
+<p>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
+<sup>NumTerms</sup></p>
+<p>TermText --&gt; &lt;PrefixLength, Suffix&gt;</p>
+<p>PrefixLength --&gt; VInt</p>
+<p>Suffix --&gt; String</p>
+<p>TermFreq --&gt; VInt</p>
+<p>Positions --&gt; &lt;VInt&gt;<sup>TermFreq</sup></p>
+<p>Offsets --&gt; &lt;VInt, VInt&gt;<sup>TermFreq</sup></p>
+<br>
+<p>Notes:</p>
+<ul>
+<li>Position/Offset byte stores whether this term vector has position or offset
+information stored.</li>
+<li>Term text prefixes are shared. The PrefixLength is the number of initial
+characters from the previous term which must be pre-pended to a term's suffix
+in order to form the term's text. Thus, if the previous term's text was "bone"
+and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
+<li>Positions are stored as delta encoded VInts. This means we only store the
+difference of the current position from the last position</li>
+<li>Offsets are stored as delta encoded VInts. The first VInt is the
+startOffset, the second is the endOffset.</li>
+</ul>
+</li>
+</ol>
+<a name="N1091F" id="N1091F"></a><a name="Deleted Documents"></a>
+<h3 class="boxed">Deleted Documents</h3>
+<p>The .del file is optional, and only exists when a segment contains
+deletions.</p>
+<p>Although per-segment, this file is maintained exterior to compound segment
+files.</p>
+<p>Deletions (.del) --&gt; [Format],ByteCount,BitCount, Bits | DGaps (depending
+on Format)</p>
+<p>Format,ByteCount,BitCount --&gt; Uint32</p>
+<p>Bits --&gt; &lt;Byte&gt; <sup>ByteCount</sup></p>
+<p>DGaps --&gt; &lt;DGap,NonzeroByte&gt; <sup>NonzeroBytesCount</sup></p>
+<p>DGap --&gt; VInt</p>
+<p>NonzeroByte --&gt; Byte</p>
+<p>Format is Optional. -1 indicates DGaps. Non-negative value indicates Bits,
+and that Format is excluded.</p>
+<p>ByteCount indicates the number of bytes in Bits. It is typically
+(SegSize/8)+1.</p>
+<p>BitCount indicates the number of bits that are currently set in Bits.</p>
+<p>Bits contains one bit for each document indexed. When the bit corresponding
+to a document number is set, that document is marked as deleted. Bit ordering
+is from least to most significant. Thus, if Bits contains two bytes, 0x00 and
+0x02, then document 9 is marked as deleted.</p>
+<p>DGaps represents sparse bit-vectors more efficiently than Bits. It is made
+of DGaps on indexes of nonzero bytes in Bits, and the nonzero bytes themselves.
+The number of nonzero bytes in Bits (NonzeroBytesCount) is not stored.</p>
+<p>For example, if there are 8000 bits and only bits 10,12,32 are set, DGaps
+would be used:</p>
+<p>(VInt) 1 , (Byte) 20 , (VInt) 3 , (Byte) 1</p>
+</div>
+<a name="N10959" id="N10959"></a><a name="Limitations" id="Limitations"></a>
+<h2 class="boxed">Limitations</h2>
+<div class="section">
+<p>When referring to term numbers, Lucene's current implementation uses a Java
+<span class="codefrag">int</span> to hold the term index, which means the
+maximum number of unique terms in any single index segment is ~2.1 billion
+times the term index interval (default 128) = ~274 billion. This is technically
+not a limitation of the index file format, just of Lucene's current
+implementation.</p>
+<p>Similarly, Lucene uses a Java <span class="codefrag">int</span> to refer to
+document numbers, and the index file format uses an <span class=
+"codefrag">Int32</span> on-disk to store document numbers. This is a limitation
+of both the index file format and the current implementation. Eventually these
+should be replaced with either <span class="codefrag">UInt64</span> values, or
+better yet, <span class="codefrag">VInt</span> values which have no limit.</p>
+</div>
+</body>
+</html>

Added: lucene/dev/trunk/lucene/site/html/lucene_green_300.gif
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/site/html/lucene_green_300.gif?rev=1328748&view=auto
==============================================================================
Binary file - no diff available.