Posted to commits@lucene.apache.org by mi...@apache.org on 2014/01/17 18:23:44 UTC

svn commit: r1559196 [12/19] - in /lucene/dev/branches/lucene5376: ./ dev-tools/ dev-tools/idea/solr/contrib/morphlines-cell/ dev-tools/maven/lucene/facet/ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/common/src/java/org/apache/luce...

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Jan 17 17:23:33 2014
@@ -37,21 +37,21 @@ public abstract class Tokenizer extends 
   /** Pending reader: not actually assigned to input until reset() */
   private Reader inputPending = ILLEGAL_STATE_READER;
 
-  /** Construct a token stream processing the given input. */
-  protected Tokenizer(Reader input) {
-    if (input == null) {
-      throw new NullPointerException("input must not be null");
-    }
-    this.inputPending = input;
+  /**
+   * Construct a tokenizer with no input, awaiting a call to {@link #setReader(java.io.Reader)}
+   * to provide input.
+   */
+  protected Tokenizer() {
+    //
   }
-  
-  /** Construct a token stream processing the given input using the given AttributeFactory. */
-  protected Tokenizer(AttributeFactory factory, Reader input) {
+
+  /**
+   * Construct a tokenizer with no input, awaiting a call to {@link #setReader(java.io.Reader)} to
+   * provide input.
+   * @param factory attribute factory.
+   */
+  protected Tokenizer(AttributeFactory factory) {
     super(factory);
-    if (input == null) {
-      throw new NullPointerException("input must not be null");
-    }
-    this.inputPending = input;
   }
 
   /**

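The practical effect of this API change: a Tokenizer no longer receives its Reader at
construction time; input is supplied later through setReader(), and the
ILLEGAL_STATE_READER sentinel guards against consuming the stream before that happens.
A minimal sketch of the new usage pattern (illustrative only; WhitespaceTokenizer is the
analyzers-common tokenizer used in the package.html examples below):

    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_XY); // no Reader argument anymore
    tok.setReader(new StringReader("some text goes here"));     // supply input before reset()
    tok.reset();
    while (tok.incrementToken()) {
      // consume attributes here
    }
    tok.end();
    tok.close();
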
Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/package.html?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/package.html (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/analysis/package.html Fri Jan 17 17:23:33 2014
@@ -80,53 +80,67 @@ and proximity searches (though sentence 
 </p>
 <ul>
   <li>
-    {@link org.apache.lucene.analysis.Analyzer} &ndash; An Analyzer is 
-    responsible for building a 
+    {@link org.apache.lucene.analysis.Analyzer} &ndash; An <code>Analyzer</code> is 
+    responsible for supplying a
     {@link org.apache.lucene.analysis.TokenStream} which can be consumed
     by the indexing and searching processes.  See below for more information
-    on implementing your own Analyzer.
+    on implementing your own {@link org.apache.lucene.analysis.Analyzer}. Most of the time, you can use
+    an anonymous subclass of {@link org.apache.lucene.analysis.Analyzer}.
   </li>
   <li>
-    CharFilter &ndash; CharFilter extends
-    {@link java.io.Reader} to perform pre-tokenization substitutions, 
-    deletions, and/or insertions on an input Reader's text, while providing
+    {@link org.apache.lucene.analysis.CharFilter} &ndash; <code>CharFilter</code> extends
+    {@link java.io.Reader} to transform the text before it is
+    tokenized, while providing
     corrected character offsets to account for these modifications.  This
     capability allows highlighting to function over the original text when 
-    indexed tokens are created from CharFilter-modified text with offsets
-    that are not the same as those in the original text. Tokenizers'
-    constructors and reset() methods accept a CharFilter.  CharFilters may
+    indexed tokens are created from <code>CharFilter</code>-modified text with offsets
+    that are not the same as those in the original text. {@link org.apache.lucene.analysis.Tokenizer#setReader(java.io.Reader)}
+    accepts <code>CharFilter</code>s.  <code>CharFilter</code>s may
     be chained to perform multiple pre-tokenization modifications.
   </li>
   <li>
-    {@link org.apache.lucene.analysis.Tokenizer} &ndash; A Tokenizer is a 
+    {@link org.apache.lucene.analysis.Tokenizer} &ndash; A <code>Tokenizer</code> is a 
     {@link org.apache.lucene.analysis.TokenStream} and is responsible for
-    breaking up incoming text into tokens. In most cases, an Analyzer will
-    use a Tokenizer as the first step in the analysis process.  However,
-    to modify text prior to tokenization, use a CharStream subclass (see
+    breaking up incoming text into tokens. In many cases, an {@link org.apache.lucene.analysis.Analyzer} will
+    use a {@link org.apache.lucene.analysis.Tokenizer} as the first step in the analysis process.  However,
+    to modify text prior to tokenization, use a {@link org.apache.lucene.analysis.CharFilter} subclass (see
     above).
   </li>
   <li>
-    {@link org.apache.lucene.analysis.TokenFilter} &ndash; A TokenFilter is
-    also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
-    for modifying tokens that have been created by the Tokenizer.  Common 
-    modifications performed by a TokenFilter are: deletion, stemming, synonym 
-    injection, and down casing.  Not all Analyzers require TokenFilters.
+    {@link org.apache.lucene.analysis.TokenFilter} &ndash; A <code>TokenFilter</code> is
+    a {@link org.apache.lucene.analysis.TokenStream} and is responsible
+    for modifying tokens that have been created by the <code>Tokenizer</code>. Common 
+    modifications performed by a <code>TokenFilter</code> are: deletion, stemming, synonym 
+    injection, and case folding.  Not all <code>Analyzer</code>s require <code>TokenFilter</code>s. (A sketch of a complete chain follows this list.)
   </li>
 </ul>
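+<p>
+  To make the division of labor concrete, the sketch referenced in the last
+  item wires the pieces together by hand (illustrative only; the concrete
+  classes are from the analyzers-common module):
+</p>
+<pre class="prettyprint">
+    Reader original = new StringReader("Some Text to Analyze");
+    Reader filtered = new HTMLStripCharFilter(original);         // CharFilter: pre-tokenization fixup
+    Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion); // Tokenizer: break text into tokens
+    tokenizer.setReader(filtered);
+    TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer); // TokenFilter: per-token changes
+</pre>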
 <h2>Hints, Tips and Traps</h2>
 <p>
-  The synergy between {@link org.apache.lucene.analysis.Analyzer} and 
-  {@link org.apache.lucene.analysis.Tokenizer} is sometimes confusing. To ease
-  this confusion, some clarifications:
+  The relationship between {@link org.apache.lucene.analysis.Analyzer} and 
+  {@link org.apache.lucene.analysis.CharFilter}s,
+  {@link org.apache.lucene.analysis.Tokenizer}s,
+  and {@link org.apache.lucene.analysis.TokenFilter}s is sometimes confusing. To ease
+    this confusion, here are some clarifications:
 </p>
 <ul>
   <li>
-    The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of 
-    <u>creating</u> tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer}
-    is only responsible for <u>breaking</u> the input text into tokens. Very likely, tokens created 
-    by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted 
-    by the {@link org.apache.lucene.analysis.Analyzer} (via one or more
-    {@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
+    The {@link org.apache.lucene.analysis.Analyzer} is a
+    <strong>factory</strong> for analysis chains. <code>Analyzer</code>s don't
+    process text; rather, they construct <code>CharFilter</code>s, <code>Tokenizer</code>s, and/or
+    <code>TokenFilter</code>s that process text. An <code>Analyzer</code> has two tasks: 
+    to produce {@link org.apache.lucene.analysis.TokenStream}s that accept a
+    reader and produce tokens, and to wrap or otherwise
+    pre-process {@link java.io.Reader} objects.
+  </li>
+  <li>
+  The {@link org.apache.lucene.analysis.CharFilter} is a subclass of
+ {@link java.io.Reader} that supports offset tracking.
+  </li>
+  <li>The {@link org.apache.lucene.analysis.Tokenizer}
+    is only responsible for <u>breaking</u> the input text into tokens.
+  </li>
+  <li>The {@link org.apache.lucene.analysis.TokenFilter} modifies a
+  stream of tokens and their contents.
   </li>
   <li>
     {@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream}, 
@@ -134,55 +148,68 @@ and proximity searches (though sentence 
   </li>
   <li>
     {@link org.apache.lucene.analysis.Analyzer} is "field aware", but 
-    {@link org.apache.lucene.analysis.Tokenizer} is not.
+    {@link org.apache.lucene.analysis.Tokenizer} is not. {@link org.apache.lucene.analysis.Analyzer}s may
+    take a field name into account when constructing the {@link org.apache.lucene.analysis.TokenStream}.
   </li>
 </ul>
 <p>
-  Lucene Java provides a number of analysis capabilities, the most commonly used one being the StandardAnalyzer.  
+  If you want to use a particular combination of <code>CharFilter</code>s, a
+  <code>Tokenizer</code>, and some <code>TokenFilter</code>s, the simplest thing is often to
+  create an anonymous subclass of {@link org.apache.lucene.analysis.Analyzer}, overriding {@link
+  org.apache.lucene.analysis.Analyzer#createComponents(String)} and perhaps also
+  {@link org.apache.lucene.analysis.Analyzer#initReader(String,
+  java.io.Reader)}. However, if you need the same set of components
+  over and over in many places, you can make a subclass of
+  {@link org.apache.lucene.analysis.Analyzer}. In fact, Apache Lucene
+  supplies a large family of <code>Analyzer</code> classes that deliver useful
+  analysis chains. The most common of these is the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/standard/StandardAnalyzer.html">StandardAnalyzer</a>.
   Many applications will have a long and industrious life with nothing more
-  than the StandardAnalyzer.  However, there are a few other classes/packages that are worth mentioning:
+  than the <code>StandardAnalyzer</code>.
+</p>
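+<p>
+  For example, an anonymous subclass combining a whitespace tokenizer, a
+  lower-case filter, and an HTML-stripping char filter might look like this
+  sketch (illustrative only; the components are from the analyzers-common
+  module):
+</p>
+<pre class="prettyprint">
+    Analyzer analyzer = new Analyzer() {
+      {@literal @Override}
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new WhitespaceTokenizer(matchVersion);
+        return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, source));
+      }
+
+      {@literal @Override}
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new HTMLStripCharFilter(reader); // wrap the raw Reader with a CharFilter
+      }
+    };
+</pre>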
+<p>
+  Aside from the <code>StandardAnalyzer</code>,
+  Lucene includes several modules containing analysis components,
+  all under the 'analysis' directory of the distribution. Some of
+  these support particular languages, while others integrate external
+  components. The 'common' subdirectory has some noteworthy
+  general-purpose analyzers, including the <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html">PerFieldAnalyzerWrapper</a>. Most <code>Analyzer</code>s perform the same operation on all
+  {@link org.apache.lucene.document.Field}s.  The <code>PerFieldAnalyzerWrapper</code> can be used to associate a different <code>Analyzer</code> with different
+  {@link org.apache.lucene.document.Field}s (as sketched below). There is a great deal of
+  functionality in the analysis area; you should study it carefully to
+  find the pieces you need.
 </p>
-<ol>
-  <li>
-    PerFieldAnalyzerWrapper &ndash; Most Analyzers perform the same operation on all
-    {@link org.apache.lucene.document.Field}s.  The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different
-    {@link org.apache.lucene.document.Field}s.
-  </li>
-  <li>
-    The analysis library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety
-    of different problems related to searching.  Many of the Analyzers are designed to analyze non-English languages.
-  </li>
-  <li>
-    There are a variety of Tokenizer and TokenFilter implementations in this package.  Take a look around, chances are someone has implemented what you need.
-  </li>
-</ol>
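+<p>
+  An illustrative use of the <code>PerFieldAnalyzerWrapper</code> (sketch
+  only; the field names are made up):
+</p>
+<pre class="prettyprint">
+    Map&lt;String,Analyzer&gt; perField = new HashMap&lt;String,Analyzer&gt;();
+    perField.put("partnum", new KeywordAnalyzer()); // don't break up part numbers
+    Analyzer analyzer = new PerFieldAnalyzerWrapper(
+        new StandardAnalyzer(matchVersion), perField); // default for all other fields
+</pre>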
 <p>
-  Analysis is one of the main causes of performance degradation during indexing.  Simply put, the more you analyze the slower the indexing (in most cases).
+  Analysis is one of the main causes of slow indexing.  Simply put, the more you analyze, the slower the indexing (in most cases).
   Perhaps your application would be just fine using the simple WhitespaceTokenizer combined with a StopFilter. The benchmark/ library can be useful 
   for testing out the speed of the analysis process.
 </p>
 <h2>Invoking the Analyzer</h2>
 <p>
-  Applications usually do not invoke analysis &ndash; Lucene does it for them:
+  Applications usually do not invoke analysis &ndash; Lucene does it
+ for them. Applications construct <code>Analyzer</code>s and pass them into Lucene,
+ as follows:
 </p>
 <ul>
   <li>
     At indexing, as a consequence of 
     {@link org.apache.lucene.index.IndexWriter#addDocument(IndexDocument) addDocument(doc)},
-    the Analyzer in effect for indexing is invoked for each indexed field of the added document.
+    the <code>Analyzer</code> in effect for indexing is invoked for each indexed field of the added document.
   </li>
   <li>
-    At search, a QueryParser may invoke the Analyzer during parsing.  Note that for some queries, analysis does not
+    At search, a <code>QueryParser</code> may invoke the Analyzer during parsing.  Note that for some queries, analysis does not
     take place, e.g. wildcard queries.
   </li>
 </ul>
 <p>
   However an application might invoke Analysis of any text for testing or for any other purpose, something like:
 </p>
-<PRE class="prettyprint">
+<PRE class="prettyprint" id="analysis-workflow">
     Version matchVersion = Version.LUCENE_XY; // Substitute desired Lucene version for XY
     Analyzer analyzer = new StandardAnalyzer(matchVersion); // or any other analyzer
     TokenStream ts = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
+    /* The Analyzer class will construct the Tokenizer, TokenFilter(s), and CharFilter(s),
+       and pass the resulting Reader to the Tokenizer.
+    */
     OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
     
     try {
@@ -204,27 +231,28 @@ and proximity searches (though sentence 
 <p>
   Selecting the "correct" analyzer is crucial
   for search quality, and can also affect indexing and search performance.
-  The "correct" analyzer differs between applications.
+  The "correct" analyzer for your application will depend on what your input text
+  looks like and what problem you are trying to solve.
   Lucene java's wiki page 
   <a href="http://wiki.apache.org/lucene-java/AnalysisParalysis">AnalysisParalysis</a> 
   provides some data on "analyzing your analyzer".
   Here are some rules of thumb:
   <ol>
     <li>Test test test... (did we say test?)</li>
-    <li>Beware of over analysis &ndash; might hurt indexing performance.</li>
-    <li>Start with same analyzer for indexing and search, otherwise searches would not find what they are supposed to...</li>
+    <li>Beware of too much analysis &ndash; it might hurt indexing performance.</li>
+    <li>Start with the same analyzer for indexing and search, otherwise searches would not find what they are supposed to...</li>
     <li>In some cases a different analyzer is required for indexing and search, for instance:
         <ul>
-           <li>Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)</li>
+           <li>Certain searches require more stop words to be filtered. (i.e. more than those that were filtered at indexing.)</li>
            <li>Query expansion by synonyms, acronyms, auto spell correction, etc.</li>
         </ul>
         This might sometimes require a modified analyzer &ndash; see the next section on how to do that.
     </li>
   </ol>
 </p>
-<h2>Implementing your own Analyzer</h2>
+<h2>Implementing your own Analyzer and Analysis Components</h2>
 <p>
-  Creating your own Analyzer is straightforward. Your Analyzer can wrap
+  Creating your own Analyzer is straightforward. Your Analyzer should subclass {@link org.apache.lucene.analysis.Analyzer}. It can use
   existing analysis components &mdash; CharFilter(s) <i>(optional)</i>, a
   Tokenizer, and TokenFilter(s) <i>(optional)</i> &mdash; or components you
   create, or a combination of existing and newly created components.  Before
@@ -271,10 +299,22 @@ and proximity searches (though sentence 
     }
   };
 </PRE>
+<h3>End of Input Cleanup</h3>
+<p>
+   At the end of each field, Lucene calls {@link org.apache.lucene.analysis.TokenStream#end()}.
+   The components of the token stream (the tokenizer and the token filters) <strong>must</strong>
+   put accurate values into the token attributes to reflect the situation at the end of the field.
+   The Offset attribute must contain the final offset (the total number of characters processed)
+   in both start and end. Attributes like PositionLength must be correct. 
+</p>
+<p>
+   The base implementation of {@link org.apache.lucene.analysis.TokenStream#end()} sets PositionIncrement to 0, which is required.
+   Components must override this method to fix up the other attributes.
+</p>
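+<p>
+   For example, a tokenizer's <code>end()</code> implementation might look
+   like this sketch (illustrative only; <code>offsetAtt</code> and
+   <code>finalOffset</code> are assumed members of the tokenizer):
+</p>
+<pre class="prettyprint">
+    {@literal @Override}
+    public void end() throws IOException {
+      super.end(); // sets PositionIncrement to 0
+      // finalOffset is the total number of characters processed:
+      offsetAtt.setOffset(finalOffset, finalOffset);
+    }
+</pre>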
 <h3>Token Position Increments</h3>
 <p>
-   By default, all tokens created by Analyzers and Tokenizers have a 
-   {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
+   By default, TokenStream arranges for the 
+   {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of all tokens to be one.
    This means that the position stored for that token in the index would be one more than
    that of the previous token.
    Recall that phrase and proximity searches rely on position info.
@@ -292,7 +332,7 @@ and proximity searches (though sentence 
    configured to not take position increments into account when generating phrase queries.
 </p>
 <p>
-  Note that a StopFilter MUST increment the position increment in order not to generate corrupt
+  Note that a filter that filters <strong>out</strong> tokens <strong>must</strong> increment the position increment in order not to generate corrupt
   tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens:
 </p>
 <PRE class="prettyprint">
@@ -386,7 +426,15 @@ and proximity searches (though sentence 
   <li>The first position increment must be &gt; 0.</li>
   <li>Positions must not go backward.</li>
   <li>Tokens that have the same start position must have the same start offset.</li>
-  <li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
+  <li>Tokens that have the same end position (taking into account the
+  position length) must have the same end offset.</li>
+  <li>Tokenizers must call {@link
+  org.apache.lucene.util.AttributeSource#clearAttributes()} in
+  incrementToken() (see the sketch after this list).</li>
+  <li>Tokenizers must override {@link
+  org.apache.lucene.analysis.TokenStream#end()}, and pass the final
+  offset (the total number of input characters processed) to both
+  parameters of {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute#setOffset(int, int)}.</li>
 </ul>
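+<p>
+   For instance, a tokenizer's <code>incrementToken()</code> typically
+   begins by clearing the attributes (illustrative sketch;
+   <code>readNextToken()</code> is a hypothetical helper that fills in the
+   term, offset and other attributes):
+</p>
+<pre class="prettyprint">
+    {@literal @Override}
+    public boolean incrementToken() throws IOException {
+      clearAttributes();      // required: reset all attributes before each new token
+      return readNextToken(); // hypothetical: sets the attributes and reports success
+    }
+</pre>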
 <p>
    Although these rules might seem easy to follow, problems can quickly happen when chaining
@@ -395,17 +443,19 @@ and proximity searches (though sentence 
 </p>
 <ul>
   <li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
-  <li>Token filters should not insert positions. If a filter needs to add tokens, then they shoud all have a position increment of 0.</li>
+  <li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
+  <li>When they add tokens, token filters should call {@link org.apache.lucene.util.AttributeSource#clearAttributes()} first (see the sketch after this list).</li>
   <li>When they remove tokens, token filters should increment the position increment of the following token.</li>
   <li>Token filters should preserve position lengths.</li>
 </ul>
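+<p>
+  As an illustration (sketch only, not a stock Lucene filter), here is a
+  filter that adds tokens while following these rules: it re-emits each
+  token once more at the same position by capturing and replaying its
+  attribute state:
+</p>
+<pre class="prettyprint">
+public final class RepeatFilter extends TokenFilter {
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private AttributeSource.State saved;
+
+  public RepeatFilter(TokenStream in) {
+    super(in);
+  }
+
+  {@literal @Override}
+  public boolean incrementToken() throws IOException {
+    if (saved != null) {
+      restoreState(saved); // replay the captured attributes of the previous token
+      saved = null;
+      posIncrAtt.setPositionIncrement(0); // injected token occupies the same position
+      return true;
+    }
+    if (!input.incrementToken()) {
+      return false;
+    }
+    saved = captureState(); // remember this token so it can be repeated
+    return true;
+  }
+
+  {@literal @Override}
+  public void reset() throws IOException {
+    super.reset();
+    saved = null;
+  }
+}
+</pre>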
 <h2>TokenStream API</h2>
 <p>
-	"Flexible Indexing" summarizes the effort of making the Lucene indexer
+  "Flexible Indexing" summarizes the effort of making the Lucene indexer
   pluggable and extensible for custom index formats.  A fully customizable
   indexer means that users will be able to store custom data structures on
-  disk. Therefore an API is necessary that can transport custom types of
-  data from the documents to the indexer.
+  disk. Therefore the analysis API must transport custom types of
+  data from the documents to the indexer. (It also supports communication
+  among the analysis components.)
 </p>
 <h3>Attribute and AttributeSource</h3>
 <p>
@@ -467,6 +517,68 @@ and proximity searches (though sentence 
     </td>
   </tr>
 </table>
+<h3>More Requirements for Analysis Component Classes</h3>
+Due to the historical development of the API, there are some perhaps
+less-than-obvious requirements when implementing analysis component
+classes.
+<h4 id="analysis-lifetime">Token Stream Lifetime</h4>
+The code fragment of the <a href="#analysis-workflow">analysis workflow
+protocol</a> above shows a token stream being obtained, used, and then
+left for garbage collection. However, that does not mean that the components of
+that token stream will, in fact, be discarded. The default is just the
+opposite. {@link org.apache.lucene.analysis.Analyzer} applies a reuse
+strategy to the tokenizer and the token filters. It will reuse
+them. For each new input, it calls {@link org.apache.lucene.analysis.Tokenizer#setReader(java.io.Reader)} 
+to set the input. Your components must be prepared for this scenario,
+as described below.
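+<p>
+For example (illustrative), two successive calls for the same field will
+typically hand back the same component instances, re-targeted at the new
+input via <code>setReader()</code>:
+</p>
+<pre class="prettyprint">
+    TokenStream ts1 = analyzer.tokenStream("myfield", new StringReader("first text"));
+    // ... consume and close ts1 ...
+    TokenStream ts2 = analyzer.tokenStream("myfield", new StringReader("second text"));
+    // ts2 may well be the very same objects as ts1, reused
+</pre>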
+<h4>Tokenizer</h4>
+<ul>
+  <li>
+  You should create your tokenizer class by extending {@link org.apache.lucene.analysis.Tokenizer}.
+  </li>
+  <li>
+  Your tokenizer <strong>must</strong> override {@link org.apache.lucene.analysis.TokenStream#end()}.
+  Your implementation <strong>must</strong> call
+  <code>super.end()</code>. It must set a correct final offset into
+  the offset attribute, and finish up any other attributes to reflect
+  the end of the stream.
+  </li>
+  <li>
+  If your tokenizer overrides {@link org.apache.lucene.analysis.TokenStream#reset()}
+  or {@link org.apache.lucene.analysis.TokenStream#close()}, it
+  <strong>must</strong> call the corresponding superclass method
+  (see the sketch after this list).
+  </li>
+</ul>
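+<p>
+  A compliant <code>reset()</code> override might look like this sketch:
+</p>
+<pre class="prettyprint">
+    {@literal @Override}
+    public void reset() throws IOException {
+      super.reset(); // required: installs the pending reader as the input
+      // ... re-initialize any per-stream state of the tokenizer here ...
+    }
+</pre>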
+<h4>Token Filter</h4>
+  You should create your token filter class by extending {@link org.apache.lucene.analysis.TokenFilter}.
+  If your token filter overrides {@link org.apache.lucene.analysis.TokenStream#reset()},
+  {@link org.apache.lucene.analysis.TokenStream#end()}
+  or {@link org.apache.lucene.analysis.TokenStream#close()}, it
+  <strong>must</strong> call the corresponding superclass method.
+<h4>Creating delegates</h4>
+  Forwarding classes (those which extend {@link org.apache.lucene.analysis.Tokenizer} but delegate
+  selected logic to another tokenizer) must also set the reader to the delegate in the overridden
+  {@link org.apache.lucene.analysis.Tokenizer#reset()} method, e.g.:
+  <pre class="prettyprint">
+    public class ForwardingTokenizer extends Tokenizer {
+       private Tokenizer delegate;
+       ...
+       {@literal @Override}
+       public void reset() {
+          super.reset();
+          delegate.setReader(this.input);
+          delegate.reset();
+       }
+    }
+  </pre>
+<h3>Testing Your Analysis Component</h3>
+<p>
+    The lucene-test-framework component defines
+    <a href="{@docRoot}/../test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.html">BaseTokenStreamTestCase</a>. By extending
+    this class, you can create JUnit tests that validate that your
+    Analyzer and/or analysis components correctly implement the
+    protocol. The checkRandomData methods of that class are particularly effective in flushing out errors.
+</p>
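+<p>
+    For example (illustrative sketch; <code>MyAnalyzer</code> is the example
+    analyzer from the next section, assumed here to take a Version
+    constructor argument):
+</p>
+<pre class="prettyprint">
+public class MyAnalyzerTest extends BaseTokenStreamTestCase {
+  public void testBasics() throws Exception {
+    Analyzer analyzer = new MyAnalyzer(Version.LUCENE_XY);
+    assertAnalyzesTo(analyzer, "This is a demo", new String[] {"This", "is", "a", "demo"});
+    checkRandomData(random(), analyzer, 1000); // random text exercises the full protocol
+  }
+}
+</pre>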
 <h3>Using the TokenStream API</h3>
 There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
 to walk through the example below first and come back to this section afterwards.
@@ -522,8 +634,8 @@ public class MyAnalyzer extends Analyzer
   }
 
   {@literal @Override}
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion, reader));
+  protected TokenStreamComponents createComponents(String fieldName) {
+    return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion));
   }
   
   public static void main(String[] args) throws IOException {
@@ -572,8 +684,8 @@ easily by adding a LengthFilter to the c
 <code>createComponents()</code> method in our analyzer needs to be changed:
 <pre class="prettyprint">
   {@literal @Override}
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new WhitespaceTokenizer(matchVersion);
     TokenStream result = new LengthFilter(true, source, 3, Integer.MAX_VALUE);
     return new TokenStreamComponents(source, result);
   }
@@ -776,8 +888,8 @@ public final class PartOfSpeechAttribute
 <p>Now we need to add the filter to the chain in MyAnalyzer:</p>
 <pre class="prettyprint">
   {@literal @Override}
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
+  protected TokenStreamComponents createComponents(String fieldName) {
+    final Tokenizer source = new WhitespaceTokenizer(matchVersion);
     TokenStream result = new LengthFilter(true, source, 3, Integer.MAX_VALUE);
     result = new PartOfSpeechTaggingFilter(result);
     return new TokenStreamComponents(source, result);
@@ -875,8 +987,8 @@ Example:
 public class MyAnalyzer extends Analyzer {
 
   {@literal @Override}
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new MyTokenizer(reader));
+  protected TokenStreamComponents createComponents(String fieldName) {
+    return new TokenStreamComponents(new MyTokenizer());
   }
   
   {@literal @Override}

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/document/Field.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/document/Field.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/document/Field.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/document/Field.java Fri Jan 17 17:23:33 2014
@@ -588,7 +588,8 @@ public class Field implements IndexableF
     }
 
     @Override
-    public void end() {
+    public void end() throws IOException {
+      super.end();
       final int finalOffset = value.length();
       offsetAttribute.setOffset(finalOffset, finalOffset);
     }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Fri Jan 17 17:23:33 2014
@@ -930,10 +930,20 @@ public class IndexWriter implements Clos
           closeInternal(waitForMerges, true);
         }
       }
-      assert eventQueue.isEmpty();
+      assert assertEventQueueAfterClose();
     }
   }
 
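+  // Used only in assertions: after close, the event queue must be empty
+  // or contain only merge-pending events.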
+  private boolean assertEventQueueAfterClose() {
+    if (eventQueue.isEmpty()) {
+      return true;
+    }
+    for (Event e : eventQueue) {
+      assert e instanceof DocumentsWriter.MergePendingEvent : e;
+    }
+    return true;
+  }
+
   // Returns true if this thread should attempt to close, or
   // false if IndexWriter is now closed; else, waits until
   // another thread finishes closing
@@ -2022,7 +2032,7 @@ public class IndexWriter implements Clos
         rollbackInternal();
       }
     }
-    assert eventQueue.isEmpty() : eventQueue;
+    assert assertEventQueueAfterClose();
   }
 
   private void rollbackInternal() throws IOException {

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java Fri Jan 17 17:23:33 2014
@@ -154,9 +154,12 @@ public class TieredMergePolicy extends M
   }
 
   /** Controls how aggressively merges that reclaim more
-   *  deletions are favored.  Higher values favor selecting
-   *  merges that reclaim deletions.  A value of 0.0 means
-   *  deletions don't impact merge selection. */
+   *  deletions are favored.  Higher values will more
+   *  aggressively target merges that reclaim deletions, but
+   *  be careful not to go so high that far too much merging
+   *  takes place; a value of 3.0 is probably already close to
+   *  too high.  A value of 0.0 means deletions don't impact
+   *  merge selection. */ 
   public TieredMergePolicy setReclaimDeletesWeight(double v) {
     if (v < 0.0) {
       throw new IllegalArgumentException("reclaimDeletesWeight must be >= 0.0 (got " + v + ")");
@@ -255,12 +258,16 @@ public class TieredMergePolicy extends M
    *  merge. */
   protected static abstract class MergeScore {
     /** Sole constructor. (For invocation by subclass 
-     * constructors, typically implicit.) */
+     *  constructors, typically implicit.) */
     protected MergeScore() {
     }
     
+    /** Returns the score for this merge candidate; lower
+     *  scores are better. */
     abstract double getScore();
 
+    /** Human readable explanation of how the merge got this
+     *  score. */
     abstract String getExplanation();
   }
 
@@ -437,9 +444,12 @@ public class TieredMergePolicy extends M
       totBeforeMergeBytes += info.sizeInBytes();
     }
 
-    // Measure "skew" of the merge, which can range
-    // from 1.0/numSegsBeingMerged (good) to 1.0
-    // (poor):
+    // Roughly measure "skew" of the merge, i.e. how
+    // "balanced" the merge is (whether the segments are
+    // about the same size), which can range from
+    // 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily
+    // lopsided merges (skew near 1.0) are no good; they mean
+    // O(N^2) merge cost over time:
     final double skew;
     if (hitTooLarge) {
       // Pretend the merge has perfect skew; skew doesn't

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java Fri Jan 17 17:23:33 2014
@@ -62,6 +62,18 @@ import org.apache.lucene.util.BytesRef;
  *  <li> {@link #compareBottom} Compare a new hit (docID)
  *       against the "weakest" (bottom) entry in the queue.
  *
+ *  <li> {@link #setTopValue} This method is called by
+ *       {@link TopFieldCollector} to notify the
+ *       FieldComparator of the top most value, which is
+ *       used by future calls to {@link #compareTop}.
+ *
+ *  <li> {@link #compareTop} Compare a new hit (docID)
+ *       against the top value previously set by a call to
+ *       {@link #setTopValue}.
+ *
  *  <li> {@link #copy} Installs a new hit into the
  *       priority queue.  The {@link FieldValueHitQueue}
  *       calls this method when a new hit is competitive.
@@ -104,7 +116,15 @@ public abstract class FieldComparator<T>
   public abstract void setBottom(final int slot);
 
   /**
-   * Compare the bottom of the queue with doc.  This will
+   * Record the top value, for future calls to {@link
+   * #compareTop}.  This is only called for searches that
+   * use searchAfter (deep paging), and is called before any
+   * calls to {@link #setNextReader}.
+   */
+  public abstract void setTopValue(T value);
+
+  /**
+   * Compare the bottom of the queue with this doc.  This is
    * only invoked after setBottom has been called.  This
    * should return the same result as {@link
    * #compare(int,int)}} as if bottom were slot1 and the new
@@ -123,6 +143,22 @@ public abstract class FieldComparator<T>
   public abstract int compareBottom(int doc) throws IOException;
 
   /**
+   * Compare the top value with this doc.  This is
+   * only invoked after setTopValue has been called.  This
+   * should return the same result as {@link
+   * #compare(int,int)} as if topValue were slot1 and the new
+   * document were slot 2.  This is only called for searches that
+   * use searchAfter (deep paging).
+   *
+   * @param doc that was hit
+   * @return any N < 0 if the doc's value is sorted after
+   * the top entry (not competitive), any N > 0 if the
+   * doc's value is sorted before the top entry and 0 if
+   * they are equal.
+   */
+  public abstract int compareTop(int doc) throws IOException;
+
+  /**
    * This method is called when a new hit is competitive.
    * You should copy any state associated with this document
    * that will be required for future comparisons, into the
@@ -184,10 +220,6 @@ public abstract class FieldComparator<T>
     }
   }
 
-  /** Returns negative result if the doc's value is less
-   *  than the provided value. */
-  public abstract int compareDocToValue(int doc, T value) throws IOException;
-
   /**
    * Base FieldComparator class for numeric types
    */
@@ -223,6 +255,7 @@ public abstract class FieldComparator<T>
     private final DoubleParser parser;
     private FieldCache.Doubles currentReaderValues;
     private double bottom;
+    private double topValue;
 
     DoubleComparator(int numHits, String field, FieldCache.Parser parser, Double missingValue) {
       super(field, missingValue);
@@ -273,20 +306,24 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Double value) {
+      topValue = value;
+    }
+
+    @Override
     public Double value(int slot) {
       return Double.valueOf(values[slot]);
     }
 
     @Override
-    public int compareDocToValue(int doc, Double valueObj) {
-      final double value = valueObj.doubleValue();
+    public int compareTop(int doc) {
       double docValue = currentReaderValues.get(doc);
       // Test for docValue == 0 to save Bits.get method call for
       // the common case (doc has value and value is non-zero):
       if (docsWithField != null && docValue == 0 && !docsWithField.get(doc)) {
         docValue = missingValue;
       }
-      return Double.compare(docValue, value);
+      return Double.compare(topValue, docValue);
     }
   }
 
@@ -297,6 +334,7 @@ public abstract class FieldComparator<T>
     private final FloatParser parser;
     private FieldCache.Floats currentReaderValues;
     private float bottom;
+    private float topValue;
 
     FloatComparator(int numHits, String field, FieldCache.Parser parser, Float missingValue) {
       super(field, missingValue);
@@ -348,20 +386,24 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Float value) {
+      topValue = value;
+    }
+
+    @Override
     public Float value(int slot) {
       return Float.valueOf(values[slot]);
     }
 
     @Override
-    public int compareDocToValue(int doc, Float valueObj) {
-      final float value = valueObj.floatValue();
+    public int compareTop(int doc) {
       float docValue = currentReaderValues.get(doc);
       // Test for docValue == 0 to save Bits.get method call for
       // the common case (doc has value and value is non-zero):
       if (docsWithField != null && docValue == 0 && !docsWithField.get(doc)) {
         docValue = missingValue;
       }
-      return Float.compare(docValue, value);
+      return Float.compare(topValue, docValue);
     }
   }
 
@@ -372,6 +414,7 @@ public abstract class FieldComparator<T>
     private final IntParser parser;
     private FieldCache.Ints currentReaderValues;
     private int bottom;                           // Value of bottom of queue
+    private int topValue;
 
     IntComparator(int numHits, String field, FieldCache.Parser parser, Integer missingValue) {
       super(field, missingValue);
@@ -422,20 +465,24 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Integer value) {
+      topValue = value;
+    }
+
+    @Override
     public Integer value(int slot) {
       return Integer.valueOf(values[slot]);
     }
 
     @Override
-    public int compareDocToValue(int doc, Integer valueObj) {
-      final int value = valueObj.intValue();
+    public int compareTop(int doc) {
       int docValue = currentReaderValues.get(doc);
       // Test for docValue == 0 to save Bits.get method call for
       // the common case (doc has value and value is non-zero):
       if (docsWithField != null && docValue == 0 && !docsWithField.get(doc)) {
         docValue = missingValue;
       }
-      return Integer.compare(docValue, value);
+      return Integer.compare(topValue, docValue);
     }
   }
 
@@ -446,6 +493,7 @@ public abstract class FieldComparator<T>
     private final LongParser parser;
     private FieldCache.Longs currentReaderValues;
     private long bottom;
+    private long topValue;
 
     LongComparator(int numHits, String field, FieldCache.Parser parser, Long missingValue) {
       super(field, missingValue);
@@ -498,20 +546,24 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Long value) {
+      topValue = value;
+    }
+
+    @Override
     public Long value(int slot) {
       return Long.valueOf(values[slot]);
     }
 
     @Override
-    public int compareDocToValue(int doc, Long valueObj) {
-      final long value = valueObj.longValue();
+    public int compareTop(int doc) {
       long docValue = currentReaderValues.get(doc);
       // Test for docValue == 0 to save Bits.get method call for
       // the common case (doc has value and value is non-zero):
       if (docsWithField != null && docValue == 0 && !docsWithField.get(doc)) {
         docValue = missingValue;
       }
-      return Long.compare(docValue, value);
+      return Long.compare(topValue, docValue);
     }
   }
 
@@ -525,7 +577,8 @@ public abstract class FieldComparator<T>
     private final float[] scores;
     private float bottom;
     private Scorer scorer;
-    
+    private float topValue;
+
     RelevanceComparator(int numHits) {
       scores = new float[numHits];
     }
@@ -559,6 +612,11 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Float value) {
+      topValue = value;
+    }
+
+    @Override
     public void setScorer(Scorer scorer) {
       // wrap with a ScoreCachingWrappingScorer so that successive calls to
       // score() will not incur score computation over and
@@ -584,11 +642,10 @@ public abstract class FieldComparator<T>
     }
 
     @Override
-    public int compareDocToValue(int doc, Float valueObj) throws IOException {
-      final float value = valueObj.floatValue();
+    public int compareTop(int doc) throws IOException {
       float docValue = scorer.score();
       assert !Float.isNaN(docValue);
-      return Float.compare(value, docValue);
+      return Float.compare(docValue, topValue);
     }
   }
 
@@ -597,6 +654,7 @@ public abstract class FieldComparator<T>
     private final int[] docIDs;
     private int docBase;
     private int bottom;
+    private int topValue;
 
     DocComparator(int numHits) {
       docIDs = new int[numHits];
@@ -634,15 +692,19 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(Integer value) {
+      topValue = value;
+    }
+
+    @Override
     public Integer value(int slot) {
       return Integer.valueOf(docIDs[slot]);
     }
 
     @Override
-    public int compareDocToValue(int doc, Integer valueObj) {
-      final int value = valueObj.intValue();
+    public int compareTop(int doc) {
       int docValue = docBase + doc;
-      return Integer.compare(docValue, value);
+      return Integer.compare(topValue, docValue);
     }
   }
   
@@ -700,13 +762,42 @@ public abstract class FieldComparator<T>
       @lucene.internal */
     BytesRef bottomValue;
 
+    /** Set by setTopValue. */
+    BytesRef topValue;
+    boolean topSameReader;
+    int topOrd;
+
+    private int docBase;
+
     final BytesRef tempBR = new BytesRef();
 
+    /** -1 if missing values are sorted first, 1 if they are
+     *  sorted last */
+    final int missingSortCmp;
+    
+    /** Which ordinal to use for a missing value. */
+    final int missingOrd;
+
+    /** Creates this, sorting missing values first. */
     public TermOrdValComparator(int numHits, String field) {
+      this(numHits, field, false);
+    }
+
+    /** Creates this, with control over how missing values
+     *  are sorted.  Pass sortMissingLast=true to put
+     *  missing values at the end. */
+    public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
       ords = new int[numHits];
       values = new BytesRef[numHits];
       readerGen = new int[numHits];
       this.field = field;
+      if (sortMissingLast) {
+        missingSortCmp = 1;
+        missingOrd = Integer.MAX_VALUE;
+      } else {
+        missingSortCmp = -1;
+        missingOrd = -1;
+      }
     }
 
     @Override
@@ -721,140 +812,78 @@ public abstract class FieldComparator<T>
         if (val2 == null) {
           return 0;
         }
-        return -1;
+        return missingSortCmp;
       } else if (val2 == null) {
-        return 1;
+        return -missingSortCmp;
       }
       return val1.compareTo(val2);
     }
 
     @Override
     public int compareBottom(int doc) {
-      throw new UnsupportedOperationException();
+      assert bottomSlot != -1;
+      int docOrd = termsIndex.getOrd(doc);
+      if (docOrd == -1) {
+        docOrd = missingOrd;
+      }
+      if (bottomSameReader) {
+        // ord is precisely comparable, even in the equal case
+        return bottomOrd - docOrd;
+      } else if (bottomOrd >= docOrd) {
+        // the equals case always means bottom is > doc
+        // (because we set bottomOrd to the lower bound in
+        // setBottom):
+        return 1;
+      } else {
+        return -1;
+      }
     }
 
     @Override
     public void copy(int slot, int doc) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public int compareDocToValue(int doc, BytesRef value) {
       int ord = termsIndex.getOrd(doc);
       if (ord == -1) {
-        if (value == null) {
-          return 0;
-        }
-        return -1;
-      } else if (value == null) {
-        return 1;
-      }
-      termsIndex.lookupOrd(ord, tempBR);
-      return tempBR.compareTo(value);
-    }
-
-    /** Base class for specialized (per bit width of the
-     * ords) per-segment comparator.  NOTE: this is messy;
-     * we do this only because hotspot can't reliably inline
-     * the underlying array access when looking up doc->ord
-     * @lucene.internal
-     */
-    abstract class PerSegmentComparator extends FieldComparator<BytesRef> {
-      
-      @Override
-      public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
-        return TermOrdValComparator.this.setNextReader(context);
-      }
-
-      @Override
-      public int compare(int slot1, int slot2) {
-        return TermOrdValComparator.this.compare(slot1, slot2);
-      }
-
-      @Override
-      public void setBottom(final int bottom) {
-        TermOrdValComparator.this.setBottom(bottom);
-      }
-
-      @Override
-      public BytesRef value(int slot) {
-        return TermOrdValComparator.this.value(slot);
-      }
-
-      @Override
-      public int compareValues(BytesRef val1, BytesRef val2) {
-        if (val1 == null) {
-          if (val2 == null) {
-            return 0;
-          }
-          return -1;
-        } else if (val2 == null) {
-          return 1;
+        ord = missingOrd;
+        values[slot] = null;
+      } else {
+        assert ord >= 0;
+        if (values[slot] == null) {
+          values[slot] = new BytesRef();
         }
-        return val1.compareTo(val2);
-      }
-
-      @Override
-      public int compareDocToValue(int doc, BytesRef value) {
-        return TermOrdValComparator.this.compareDocToValue(doc, value);
+        termsIndex.lookupOrd(ord, values[slot]);
       }
+      ords[slot] = ord;
+      readerGen[slot] = currentReaderGen;
     }
+    
+    @Override
+    public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
+      docBase = context.docBase;
+      termsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), field);
+      currentReaderGen++;
 
-    // Used per-segment when docToOrd is null:
-    private final class AnyOrdComparator extends PerSegmentComparator {
-      private final SortedDocValues termsIndex;
-      private final int docBase;
-
-      public AnyOrdComparator(SortedDocValues termsIndex, int docBase) {
-        this.termsIndex = termsIndex;
-        this.docBase = docBase;
-      }
-
-      @Override
-      public int compareBottom(int doc) {
-        assert bottomSlot != -1;
-        final int docOrd = termsIndex.getOrd(doc);
-        if (bottomSameReader) {
-          // ord is precisely comparable, even in the equal case
-          return bottomOrd - docOrd;
-        } else if (bottomOrd >= docOrd) {
-          // the equals case always means bottom is > doc
-          // (because we set bottomOrd to the lower bound in
-          // setBottom):
-          return 1;
-        } else {
-          return -1;
-        }
-      }
-
-      @Override
-      public void copy(int slot, int doc) {
-        final int ord = termsIndex.getOrd(doc);
-        ords[slot] = ord;
-        if (ord == -1) {
-          values[slot] = null;
+      if (topValue != null) {
+        // Recompute topOrd/SameReader
+        int ord = termsIndex.lookupTerm(topValue);
+        if (ord >= 0) {
+          topSameReader = true;
+          topOrd = ord;
         } else {
-          assert ord >= 0;
-          if (values[slot] == null) {
-            values[slot] = new BytesRef();
-          }
-          termsIndex.lookupOrd(ord, values[slot]);
+          topSameReader = false;
+          topOrd = -ord-2;
         }
-        readerGen[slot] = currentReaderGen;
+      } else {
+        topOrd = missingOrd;
+        topSameReader = true;
       }
-    }
+      //System.out.println("  setNextReader topOrd=" + topOrd + " topSameReader=" + topSameReader);
 
-    @Override
-    public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
-      final int docBase = context.docBase;
-      termsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), field);
-      FieldComparator<BytesRef> perSegComp = new AnyOrdComparator(termsIndex, docBase);
-      currentReaderGen++;
       if (bottomSlot != -1) {
-        perSegComp.setBottom(bottomSlot);
+        // Recompute bottomOrd/SameReader
+        setBottom(bottomSlot);
       }
 
-      return perSegComp;
+      return this;
     }
     
     @Override
@@ -867,18 +896,18 @@ public abstract class FieldComparator<T>
         bottomSameReader = true;
       } else {
         if (bottomValue == null) {
-          // -1 ord is null for all segments
-          assert ords[bottomSlot] == -1;
-          bottomOrd = -1;
+          // missingOrd is null for all segments
+          assert ords[bottomSlot] == missingOrd;
+          bottomOrd = missingOrd;
           bottomSameReader = true;
           readerGen[bottomSlot] = currentReaderGen;
         } else {
-          final int index = termsIndex.lookupTerm(bottomValue);
-          if (index < 0) {
-            bottomOrd = -index - 2;
+          final int ord = termsIndex.lookupTerm(bottomValue);
+          if (ord < 0) {
+            bottomOrd = -ord - 2;
             bottomSameReader = false;
           } else {
-            bottomOrd = index;
+            bottomOrd = ord;
             // exact value match
             bottomSameReader = true;
             readerGen[bottomSlot] = currentReaderGen;            
@@ -889,27 +918,76 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(BytesRef value) {
+      // null is fine: it means the last doc of the prior
+      // search was missing this value
+      topValue = value;
+      //System.out.println("setTopValue " + topValue);
+    }
+
+    @Override
     public BytesRef value(int slot) {
       return values[slot];
     }
+
+    @Override
+    public int compareTop(int doc) {
+
+      int ord = termsIndex.getOrd(doc);
+      if (ord == -1) {
+        ord = missingOrd;
+      }
+
+      if (topSameReader) {
+        // ord is precisely comparable, even in the equal
+        // case
+        //System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
+        return topOrd - ord;
+      } else if (ord <= topOrd) {
+        // the equals case always means doc is < value
+        // (because we set topOrd to the lower bound)
+        return 1;
+      } else {
+        return -1;
+      }
+    }
+
+    @Override
+    public int compareValues(BytesRef val1, BytesRef val2) {
+      if (val1 == null) {
+        if (val2 == null) {
+          return 0;
+        }
+        return missingSortCmp;
+      } else if (val2 == null) {
+        return -missingSortCmp;
+      }
+      return val1.compareTo(val2);
+    }
   }
   
-  // just used internally in this comparator
-  private static final byte[] MISSING_BYTES = new byte[0];
-
   /** Sorts by field's natural Term sort order.  All
    *  comparisons are done using BytesRef.compareTo, which is
    *  slow for medium to large result sets but possibly
    *  very fast for very small results sets. */
+  // TODO: should we remove this?  who really uses it?
   public static final class TermValComparator extends FieldComparator<BytesRef> {
 
+    // sentinels, just used internally in this comparator
+    private static final byte[] MISSING_BYTES = new byte[0];
+    private static final byte[] NON_MISSING_BYTES = new byte[0];
+
     private BytesRef[] values;
     private BinaryDocValues docTerms;
     private Bits docsWithField;
     private final String field;
     private BytesRef bottom;
+    private BytesRef topValue;
     private final BytesRef tempBR = new BytesRef();
 
+    // TODO: add missing first/last support here?
+
+    /** Sole constructor. */
     TermValComparator(int numHits, String field) {
       values = new BytesRef[numHits];
       this.field = field;
@@ -919,12 +997,12 @@ public abstract class FieldComparator<T>
     public int compare(int slot1, int slot2) {
       final BytesRef val1 = values[slot1];
       final BytesRef val2 = values[slot2];
-      if (val1 == null) {
-        if (val2 == null) {
+      if (val1.bytes == MISSING_BYTES) {
+        if (val2.bytes == MISSING_BYTES) {
           return 0;
         }
         return -1;
-      } else if (val2 == null) {
+      } else if (val2.bytes == MISSING_BYTES) {
         return 1;
       }
 
@@ -934,18 +1012,8 @@ public abstract class FieldComparator<T>
     @Override
     public int compareBottom(int doc) {
       docTerms.get(doc, tempBR);
-      if (tempBR.length == 0 && docsWithField.get(doc) == false) {
-        tempBR.bytes = MISSING_BYTES;
-      }
-      if (bottom.bytes == MISSING_BYTES) {
-        if (tempBR.bytes == MISSING_BYTES) {
-          return 0;
-        }
-        return -1;
-      } else if (tempBR.bytes == MISSING_BYTES) {
-        return 1;
-      }
-      return bottom.compareTo(tempBR);
+      setMissingBytes(doc, tempBR);
+      return compareValues(bottom, tempBR);
     }
 
     @Override
@@ -954,9 +1022,7 @@ public abstract class FieldComparator<T>
         values[slot] = new BytesRef();
       }
       docTerms.get(doc, values[slot]);
-      if (values[slot].length == 0 && docsWithField.get(doc) == false) {
-        values[slot].bytes = MISSING_BYTES;
-      }
+      setMissingBytes(doc, values[slot]);
     }
 
     @Override
@@ -972,30 +1038,48 @@ public abstract class FieldComparator<T>
     }
 
     @Override
+    public void setTopValue(BytesRef value) {
+      if (value == null) {
+        throw new IllegalArgumentException("value cannot be null");
+      }
+      topValue = value;
+    }
+
+    @Override
     public BytesRef value(int slot) {
       return values[slot];
     }
 
     @Override
     public int compareValues(BytesRef val1, BytesRef val2) {
-      if (val1 == null) {
-        if (val2 == null) {
+      // missing always sorts first:
+      if (val1.bytes == MISSING_BYTES) {
+        if (val2.bytes == MISSING_BYTES) {
           return 0;
         }
         return -1;
-      } else if (val2 == null) {
+      } else if (val2.bytes == MISSING_BYTES) {
         return 1;
       }
       return val1.compareTo(val2);
     }
 
     @Override
-    public int compareDocToValue(int doc, BytesRef value) {
+    public int compareTop(int doc) {
       docTerms.get(doc, tempBR);
-      if (tempBR.length == 0 && docsWithField.get(doc) == false) {
-        tempBR.bytes = MISSING_BYTES;
+      setMissingBytes(doc, tempBR);
+      return compareValues(topValue, tempBR);
+    }
+
+    private void setMissingBytes(int doc, BytesRef br) {
+      if (br.length == 0) {
+        br.offset = 0;
+        if (docsWithField.get(doc) == false) {
+          br.bytes = MISSING_BYTES;
+        } else {
+          br.bytes = NON_MISSING_BYTES;
+        }
       }
-      return tempBR.compareTo(value);
     }
   }
 }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldDoc.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldDoc.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldDoc.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/FieldDoc.java Fri Jan 17 17:23:33 2014
@@ -17,6 +17,8 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import java.util.Arrays;
+
 /**
  * Expert: A ScoreDoc which also contains information about
  * how to sort the referenced document.  In addition to the
@@ -69,14 +71,10 @@ public class FieldDoc extends ScoreDoc {
   @Override
   public String toString() {
     // super.toString returns the doc and score information, so just add the
-          // fields information
+    // fields information
     StringBuilder sb = new StringBuilder(super.toString());
-    sb.append("[");
-    for (int i = 0; i < fields.length; i++) {
-            sb.append(fields[i]).append(", ");
-          }
-    sb.setLength(sb.length() - 2); // discard last ", "
-    sb.append("]");
+    sb.append(" fields=");
+    sb.append(Arrays.toString(fields));
     return sb.toString();
   }
 }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/SortField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/SortField.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/SortField.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/SortField.java Fri Jan 17 17:23:33 2014
@@ -106,6 +106,9 @@ public class SortField {
   // Used for 'sortMissingFirst/Last'
   public Object missingValue = null;
 
+  // Only used with type=STRING
+  public boolean sortMissingLast;
+
   /** Creates a sort by terms in the given field with the type of term
    * values explicitly given.
    * @param field  Name of field to sort by.  Can be <code>null</code> if
@@ -165,13 +168,34 @@ public class SortField {
     this.reverse = reverse;
     this.parser = parser;
   }
+
+  /** Pass this to {@link #setMissingValue} to have missing
+   *  string values sort first. */
+  public final static Object STRING_FIRST = new Object() {
+      @Override
+      public String toString() {
+        return "SortField.STRING_FIRST";
+      }
+    };
   
-  public SortField setMissingValue(Object missingValue) {
-    if (type != Type.INT && type != Type.FLOAT && type != Type.LONG && type != Type.DOUBLE) {
-      throw new IllegalArgumentException( "Missing value only works for numeric types" );
+  /** Pass this to {@link #setMissingValue} to have missing
+   *  string values sort last. */
+  public final static Object STRING_LAST = new Object() {
+      @Override
+      public String toString() {
+        return "SortField.STRING_LAST";
+      }
+    };
+
+  public void setMissingValue(Object missingValue) {
+    if (type == Type.STRING) {
+      if (missingValue != STRING_FIRST && missingValue != STRING_LAST) {
+        throw new IllegalArgumentException("For STRING type, missing value must be either STRING_FIRST or STRING_LAST");
+      }
+    } else if (type != Type.INT && type != Type.FLOAT && type != Type.LONG && type != Type.DOUBLE) {
+      throw new IllegalArgumentException("Missing value only works for numeric or STRING types");
     }
     this.missingValue = missingValue;
-    return this;
   }
 
   /** Creates a sort with a custom comparison function.
@@ -294,6 +318,10 @@ public class SortField {
     }
 
     if (reverse) buffer.append('!');
+    if (missingValue != null) {
+      buffer.append(" missingValue=");
+      buffer.append(missingValue);
+    }
 
     return buffer.toString();
   }
@@ -376,9 +404,10 @@ public class SortField {
       return comparatorSource.newComparator(field, numHits, sortPos, reverse);
 
     case STRING:
-      return new FieldComparator.TermOrdValComparator(numHits, field);
+      return new FieldComparator.TermOrdValComparator(numHits, field, missingValue == STRING_LAST);
 
     case STRING_VAL:
+      // TODO: should we remove this?  who really uses it?
       return new FieldComparator.TermValComparator(numHits, field);
 
     case REWRITEABLE:

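Note that setMissingValue() now returns void rather than this, so call sites that chained it off the constructor need a small rewrite. A hedged sketch of the new STRING support from the caller's side (the field name, query and searcher are illustrative):

    // Sort by "title", sending documents with no value for the field to the end:
    SortField sf = new SortField("title", SortField.Type.STRING);
    sf.setMissingValue(SortField.STRING_LAST); // or SortField.STRING_FIRST
    TopDocs hits = searcher.search(query, 10, new Sort(sf));

For the numeric types the argument is still a boxed number; anything else keeps throwing IllegalArgumentException.
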
Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/search/TopFieldCollector.java Fri Jan 17 17:23:33 2014
@@ -869,6 +869,13 @@ public abstract class TopFieldCollector 
 
       // Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN.
       maxScore = Float.NEGATIVE_INFINITY;
+
+      // Tell all comparators their top value:
+      for(int i=0;i<comparators.length;i++) {
+        @SuppressWarnings("unchecked")
+        FieldComparator<Object> comparator = (FieldComparator<Object>) comparators[i];
+        comparator.setTopValue(after.fields[i]);
+      }
     }
     
     void updateBottom(int doc, float score) {
@@ -880,37 +887,9 @@ public abstract class TopFieldCollector 
     @SuppressWarnings({"unchecked", "rawtypes"})
     @Override
     public void collect(int doc) throws IOException {
-      totalHits++;
-
       //System.out.println("  collect doc=" + doc);
 
-      // Check if this hit was already collected on a
-      // previous page:
-      boolean sameValues = true;
-      for(int compIDX=0;compIDX<comparators.length;compIDX++) {
-        final FieldComparator comp = comparators[compIDX];
-
-        final int cmp = reverseMul[compIDX] * comp.compareDocToValue(doc, after.fields[compIDX]);
-        if (cmp < 0) {
-          // Already collected on a previous page
-          //System.out.println("    skip: before");
-          return;
-        } else if (cmp > 0) {
-          // Not yet collected
-          sameValues = false;
-          //System.out.println("    keep: after");
-          break;
-        }
-      }
-
-      // Tie-break by docID:
-      if (sameValues && doc <= afterDoc) {
-        // Already collected on a previous page
-        //System.out.println("    skip: tie-break");
-        return;
-      }
-
-      collectedHits++;
+      totalHits++;
 
       float score = Float.NaN;
       if (trackMaxScore) {
@@ -921,7 +900,8 @@ public abstract class TopFieldCollector 
       }
 
       if (queueFull) {
-        // Fastmatch: return if this hit is not competitive
+        // Fastmatch: return if this hit is no better than
+        // the worst hit currently in the queue:
         for (int i = 0;; i++) {
           final int c = reverseMul[i] * comparators[i].compareBottom(doc);
           if (c < 0) {
@@ -939,7 +919,35 @@ public abstract class TopFieldCollector 
             break;
           }
         }
+      }
+
+      // Check if this hit was already collected on a
+      // previous page:
+      boolean sameValues = true;
+      for(int compIDX=0;compIDX<comparators.length;compIDX++) {
+        final FieldComparator comp = comparators[compIDX];
+
+        final int cmp = reverseMul[compIDX] * comp.compareTop(doc);
+        if (cmp > 0) {
+          // Already collected on a previous page
+          //System.out.println("    skip: before");
+          return;
+        } else if (cmp < 0) {
+          // Not yet collected
+          sameValues = false;
+          //System.out.println("    keep: after; reverseMul=" + reverseMul[compIDX]);
+          break;
+        }
+      }
 
+      // Tie-break by docID:
+      if (sameValues && doc <= afterDoc) {
+        // Already collected on a previous page
+        //System.out.println("    skip: tie-break");
+        return;
+      }
+
+      if (queueFull) {
         // This hit is competitive - replace bottom element in queue & adjustTop
         for (int i = 0; i < comparators.length; i++) {
           comparators[i].copy(bottom.slot, doc);
@@ -955,6 +963,8 @@ public abstract class TopFieldCollector 
           comparators[i].setBottom(bottom.slot);
         }
       } else {
+        collectedHits++;
+
         // Startup transient: queue hasn't gathered numHits yet
         final int slot = collectedHits - 1;
         //System.out.println("    slot=" + slot);

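The collect() reshuffle above changes only the internal order of checks: the cheap bottom-of-queue fastmatch runs first, then compareTop() against the previous page's last hit, then the docID tie-break. Callers drive paging the same way as before; a hedged sketch (searcher, query and sort are illustrative):

    ScoreDoc after = null;
    while (true) {
      TopDocs page = (after == null)
          ? searcher.search(query, 10, sort)
          : searcher.searchAfter(after, query, 10, sort);
      if (page.scoreDocs.length == 0) {
        break;
      }
      // Under a Sort, each ScoreDoc is a FieldDoc carrying the sort values
      // that setTopValue() consumes on the next page.
      after = page.scoreDocs[page.scoreDocs.length - 1];
    }
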
Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java Fri Jan 17 17:23:33 2014
@@ -242,12 +242,6 @@ public abstract class FSDirectory extend
     return file.exists();
   }
 
-  /** Returns the time the named file was last modified. */
-  public static long fileModified(File directory, String name) {
-    File file = new File(directory, name);
-    return file.lastModified();
-  }
-
   /** Returns the length in bytes of a file in the directory. */
   @Override
   public long fileLength(String name) throws IOException {

Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/SloppyMath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/SloppyMath.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/SloppyMath.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/SloppyMath.java Fri Jan 17 17:23:33 2014
@@ -44,10 +44,16 @@ public class SloppyMath {
   public static double haversin(double lat1, double lon1, double lat2, double lon2) {
     double x1 = lat1 * TO_RADIANS;
     double x2 = lat2 * TO_RADIANS;
-    double h1 = (1 - cos(x1 - x2)) / 2;
-    double h2 = (1 - cos((lon1 - lon2) * TO_RADIANS)) / 2;
-    double h = h1 + cos(x1) * cos(x2) * h2;
-    return TO_KILOMETERS * 2 * asin(Math.min(1, Math.sqrt(h)));
+    double h1 = 1 - cos(x1 - x2);
+    double h2 = 1 - cos((lon1 - lon2) * TO_RADIANS);
+    double h = (h1 + cos(x1) * cos(x2) * h2) / 2;
+
+    double avgLat = Math.abs((x1 + x2) / 2d);
+    int index = (int)(avgLat * RADIUS_INDEXER + 0.5) % earthDiameterPerLatitude.length;
+    double diameter = earthDiameterPerLatitude[index]; // table stores 2R, so 2R*asin(...) needs no extra factor of 2
+
+    return diameter * asin(Math.min(1, Math.sqrt(h)));
+
   }
 
   /**
@@ -134,7 +140,6 @@ public class SloppyMath {
   
   // haversin
   private static final double TO_RADIANS = Math.PI / 180D;
-  private static final double TO_KILOMETERS = 6371.0087714D;
   
   // cos/asin
   private static final double ONE_DIV_F2 = 1/2.0;
@@ -184,6 +189,11 @@ public class SloppyMath {
   private static final double ASIN_QS3 = Double.longBitsToDouble(0xbfe6066c1b8d0159L); // -6.88283971605453293030e-01
   private static final double ASIN_QS4 = Double.longBitsToDouble(0x3fb3b8c5b12e9282L); //  7.70381505559019352791e-02
   
+  private static final int RADIUS_TABS_SIZE = (1<<10) + 1;
+  private static final double RADIUS_DELTA = (StrictMath.PI/2d) / (RADIUS_TABS_SIZE - 1);
+  private static final double RADIUS_INDEXER = 1d/RADIUS_DELTA;
+  private static final double[] earthDiameterPerLatitude = new double[RADIUS_TABS_SIZE];
+  
   /** Initializes look-up tables. */
   static {
     // sin and cos
@@ -226,5 +236,27 @@ public class SloppyMath {
       asinDer3DivF3Tab[i] = ((1+2*x*x)*oneMinusXSqInv2_5) * ONE_DIV_F3;
       asinDer4DivF4Tab[i] = ((5+2*x*(2+x*(5-2*x)))*oneMinusXSqInv3_5) * ONE_DIV_F4;
     }
+    
+    
+    // WGS84 earth-ellipsoid major (a) and minor (b) radius
+    final double a = 6_378_137; // [m]
+    final double b = 6_356_752.31420; // [m]
+    
+    final double a2 = a*a;
+    final double b2 = b*b;
+    
+    earthDiameterPerLatitude[0] = 2 * a / 1000d;
+    earthDiameterPerLatitude[RADIUS_TABS_SIZE-1] = 2 * b / 1000d;
+    // earth radius
+    for (int i=1;i<RADIUS_TABS_SIZE-1;i++) {
+      final double lat = i * RADIUS_DELTA; // keep the grid consistent with RADIUS_INDEXER used in haversin()
+      double one = StrictMath.pow(a2 * StrictMath.cos(lat), 2); 
+      double two = StrictMath.pow(b2 * StrictMath.sin(lat), 2);
+      double three = StrictMath.pow(a * StrictMath.cos(lat), 2);
+      double four = StrictMath.pow(b * StrictMath.sin(lat), 2);
+      
+      double radius = StrictMath.sqrt((one+two)/(three+four));
+      earthDiameterPerLatitude[i] = 2 * radius / 1000d;
+    }
   }
 }

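The fixed mean-earth radius (TO_KILOMETERS, 6371.0087714 km) is gone: haversin() now looks up a WGS84 earth diameter from a 1025-entry table indexed by the average latitude of the two points, built from R(lat) = sqrt(((a^2 cos lat)^2 + (b^2 sin lat)^2) / ((a cos lat)^2 + (b sin lat)^2)). A quick hedged sanity check (coordinates are roughly Paris and London; the exact result depends on the table granularity):

    // haversin(lat1, lon1, lat2, lon2) takes degrees and returns kilometers:
    double km = SloppyMath.haversin(48.8566, 2.3522, 51.5074, -0.1278);
    System.out.println(km); // expect roughly 343-344 km
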
Modified: lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java Fri Jan 17 17:23:33 2014
@@ -96,7 +96,7 @@ package org.apache.lucene.util;
 public final class UnicodeUtil {
   
   /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
-   *  one would normally encounter, and definitely bigger than any UTF-8 terms.
+   *  (e.g. collation keys) one would normally encounter, and definitely bigger than any UTF-8 terms.
    *  <p>
    *  WARNING: This is not a valid UTF8 Term  
    **/

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearch.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearch.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearch.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearch.java Fri Jan 17 17:23:33 2014
@@ -92,7 +92,7 @@ public class TestSearch extends LuceneTe
       doTestSearch(random(), pw, false);
       pw.close();
       sw.close();
-      String multiFileOutput = sw.getBuffer().toString();
+      String multiFileOutput = sw.toString();
       //System.out.println(multiFileOutput);
 
       sw = new StringWriter();
@@ -100,7 +100,7 @@ public class TestSearch extends LuceneTe
       doTestSearch(random(), pw, true);
       pw.close();
       sw.close();
-      String singleFileOutput = sw.getBuffer().toString();
+      String singleFileOutput = sw.toString();
 
       assertEquals(multiFileOutput, singleFileOutput);
     }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearchForDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearchForDuplicates.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearchForDuplicates.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/TestSearchForDuplicates.java Fri Jan 17 17:23:33 2014
@@ -53,7 +53,7 @@ public class TestSearchForDuplicates ext
       doTest(random(), pw, false, MAX_DOCS);
       pw.close();
       sw.close();
-      String multiFileOutput = sw.getBuffer().toString();
+      String multiFileOutput = sw.toString();
       //System.out.println(multiFileOutput);
 
       sw = new StringWriter();
@@ -61,7 +61,7 @@ public class TestSearchForDuplicates ext
       doTest(random(), pw, true, MAX_DOCS);
       pw.close();
       sw.close();
-      String singleFileOutput = sw.getBuffer().toString();
+      String singleFileOutput = sw.toString();
 
       assertEquals(multiFileOutput, singleFileOutput);
   }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java Fri Jan 17 17:23:33 2014
@@ -17,13 +17,9 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Reader;
 import java.io.StringWriter;
 import java.io.PrintWriter;
-import java.io.Writer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -63,10 +59,6 @@ public class TestGraphTokenizers extends
     private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
 
-    public GraphTokenizer(Reader input) {
-      super(input);
-    }
-
     @Override
     public void reset() throws IOException {
       super.reset();
@@ -174,8 +166,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t2 = new MockGraphTokenFilter(random(), t);
             return new TokenStreamComponents(t, t2);
           }
@@ -196,8 +188,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new GraphTokenizer(reader);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new GraphTokenizer();
             final TokenStream t2 = new MockGraphTokenFilter(random(), t);
             return new TokenStreamComponents(t, t2);
           }
@@ -258,8 +250,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t2 = new MockGraphTokenFilter(random(), t);
             final TokenStream t3 = new RemoveATokens(t2);
             return new TokenStreamComponents(t, t3);
@@ -285,8 +277,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t2 = new RemoveATokens(t);
             final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
             return new TokenStreamComponents(t, t3);
@@ -312,8 +304,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t2 = new MockGraphTokenFilter(random(), t);
             return new TokenStreamComponents(t, t2);
           }
@@ -336,8 +328,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t1 = new MockGraphTokenFilter(random(), t);
             final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
             return new TokenStreamComponents(t, t2);
@@ -360,8 +352,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t1 = new MockGraphTokenFilter(random(), t);
             final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
             return new TokenStreamComponents(t, t2);
@@ -384,8 +376,8 @@ public class TestGraphTokenizers extends
       // seed:
       final Analyzer a = new Analyzer() {
           @Override
-          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-            final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
             final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
             final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
             return new TokenStreamComponents(t, t2);

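All of the test edits in this file are the same mechanical change: createComponents() loses its Reader parameter and tokenizers are constructed without input. Consuming the stream is unchanged, since the Reader now arrives through Analyzer.tokenStream(). A hedged sketch (field name and text invented; CharTermAttribute is from org.apache.lucene.analysis.tokenattributes):

    TokenStream ts = analyzer.tokenStream("body", new StringReader("some text"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
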
Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestLookaheadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestLookaheadTokenFilter.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestLookaheadTokenFilter.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestLookaheadTokenFilter.java Fri Jan 17 17:23:33 2014
@@ -26,9 +26,9 @@ public class TestLookaheadTokenFilter ex
   public void testRandomStrings() throws Exception {
     Analyzer a = new Analyzer() {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      protected TokenStreamComponents createComponents(String fieldName) {
         Random random = random();
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random.nextBoolean());
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, random.nextBoolean());
         TokenStream output = new MockRandomLookaheadTokenFilter(random, tokenizer);
         return new TokenStreamComponents(tokenizer, output);
       }
@@ -55,8 +55,8 @@ public class TestLookaheadTokenFilter ex
   public void testNeverCallingPeek() throws Exception {
     Analyzer a = new Analyzer() {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, random().nextBoolean());
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, random().nextBoolean());
         TokenStream output = new NeverPeeksLookaheadTokenFilter(tokenizer);
         return new TokenStreamComponents(tokenizer, output);
       }
@@ -67,9 +67,8 @@ public class TestLookaheadTokenFilter ex
   public void testMissedFirstToken() throws Exception {
     Analyzer analyzer = new Analyzer() {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName,
-                                                       Reader reader) {
-        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         TrivialLookaheadFilter filter = new TrivialLookaheadFilter(source);
         return new TokenStreamComponents(source, filter);
      }

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java Fri Jan 17 17:23:33 2014
@@ -187,8 +187,8 @@ public class TestMockAnalyzer extends Ba
   public void testTooLongToken() throws Exception {
     Analyzer whitespace = new Analyzer() {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false, 5);
         return new TokenStreamComponents(t, t);
       }
     };
@@ -235,8 +235,8 @@ public class TestMockAnalyzer extends Ba
       final int limit = _TestUtil.nextInt(random(), 0, 500);
       Analyzer a = new Analyzer() {
         @Override
-        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-          Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer t = new MockTokenizer(dfa, lowercase, limit);
           return new TokenStreamComponents(t, t);
         }
       };

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockCharFilter.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockCharFilter.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestMockCharFilter.java Fri Jan 17 17:23:33 2014
@@ -26,8 +26,8 @@ public class TestMockCharFilter extends 
     Analyzer analyzer = new Analyzer() {
 
       @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
 

Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java Fri Jan 17 17:23:33 2014
@@ -224,7 +224,8 @@ public class TestToken extends LuceneTes
   }
 
   public void testTokenAttributeFactory() throws Exception {
-    TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, new StringReader("foo bar"), MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+    TokenStream ts = new MockTokenizer(Token.TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
+    ((Tokenizer)ts).setReader(new StringReader("foo bar"));
     
     assertTrue("SenselessAttribute is not implemented by SenselessAttributeImpl",
       ts.addAttribute(SenselessAttribute.class) instanceof SenselessAttributeImpl);

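When a Tokenizer is driven directly, as in the test above, the order of operations under the new API is construct, setReader(), reset(), consume, end(), close(). A hedged sketch using the same MockTokenizer:

    Tokenizer tok = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tok.setReader(new StringReader("foo bar"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // "foo", then "bar"
    }
    tok.end();
    tok.close();
    // setReader() may be called again to reuse the instance on new input.
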
Modified: lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java?rev=1559196&r1=1559195&r2=1559196&view=diff
==============================================================================
--- lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java (original)
+++ lucene/dev/branches/lucene5376/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java Fri Jan 17 17:23:33 2014
@@ -69,8 +69,8 @@ public class TestBlockPostingsFormat3 ex
     Directory dir = newDirectory();
     Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
       @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader);
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer();
         if (fieldName.contains("payloadsFixed")) {
           TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
           return new TokenStreamComponents(tokenizer, filter);