Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2010/03/14 21:58:34 UTC

svn commit: r922957 [2/3] - in /lucene/solr/branches/solr: ./ lib/ src/common/org/apache/solr/common/util/ src/java/org/apache/solr/analysis/ src/java/org/apache/solr/handler/ src/java/org/apache/solr/handler/admin/ src/java/org/apache/solr/handler/com...

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Sun Mar 14 20:58:32 2010
@@ -19,12 +19,14 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.ArrayUtil;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
 
 /**
  * Splits words into subwords and performs optional transformations on subword groups.
@@ -50,118 +52,108 @@ import java.util.List;
  *     - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
  *
  *  One use for WordDelimiterFilter is to help match words with different subword delimiters.
- *  For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi"
- *  queries to all match.
- *  One way of doing so is to specify combinations="1" in the analyzer
- *  used for indexing, and combinations="0" (the default) in the analyzer
- *  used for querying.  Given that the current StandardTokenizer
- *  immediately removes many intra-word delimiters, it is recommended that
- *  this filter be used after a tokenizer that does not do this
- *  (such as WhitespaceTokenizer).
+ *  For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
+ *  One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
+ *  in the analyzer used for querying.  Given that the current StandardTokenizer immediately removes many intra-word
+ *  delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
  *
  *  @version $Id$
  */
 
 final class WordDelimiterFilter extends TokenFilter {
-  private final byte[] charTypeTable;
-
-  public static final int         LOWER=0x01;
-  public static final int         UPPER=0x02;
-  public static final int         DIGIT=0x04;
-  public static final int SUBWORD_DELIM=0x08;
+  
+  public static final int LOWER = 0x01;
+  public static final int UPPER = 0x02;
+  public static final int DIGIT = 0x04;
+  public static final int SUBWORD_DELIM = 0x08;
 
   // combinations: for testing, not for setting bits
-  public static final int    ALPHA=0x03;
-  public static final int    ALPHANUM=0x07;
-
-  // TODO: should there be a WORD_DELIM category for
-  // chars that only separate words (no catenation of subwords
-  // will be done if separated by these chars?)
-  // "," would be an obvious candidate...
-
-  static byte[] defaultWordDelimTable;
-  static {
-    byte[] tab = new byte[256];
-    for (int i=0; i<256; i++) {
-      byte code = 0;
-      if (Character.isLowerCase(i)) code |= LOWER;
-      else if (Character.isUpperCase(i)) code |= UPPER;
-      else if (Character.isDigit(i)) code |= DIGIT;
-      if (code==0) code=SUBWORD_DELIM;
-      tab[i]=code;
-    }
-    defaultWordDelimTable = tab;
-  }
+  public static final int ALPHA = 0x03;
+  public static final int ALPHANUM = 0x07;
 
   /**
-   * If 1, causes parts of words to be generated:
+   * If true, causes parts of words to be generated:
    * <p/>
    * "PowerShot" => "Power" "Shot"
    */
-  final int generateWordParts;
+  final boolean generateWordParts;
 
   /**
-   * If 1, causes number subwords to be generated:
+   * If true, causes number subwords to be generated:
    * <p/>
    * "500-42" => "500" "42"
    */
-  final int generateNumberParts;
+  final boolean generateNumberParts;
 
   /**
-   * If 1, causes maximum runs of word parts to be catenated:
+   * If true, causes maximum runs of word parts to be catenated:
    * <p/>
    * "wi-fi" => "wifi"
    */
-  final int catenateWords;
+  final boolean catenateWords;
 
   /**
-   * If 1, causes maximum runs of number parts to be catenated:
+   * If true, causes maximum runs of number parts to be catenated:
    * <p/>
    * "500-42" => "50042"
    */
-  final int catenateNumbers;
+  final boolean catenateNumbers;
 
   /**
-   * If 1, causes all subword parts to be catenated:
+   * If true, causes all subword parts to be catenated:
    * <p/>
    * "wi-fi-4000" => "wifi4000"
    */
-  final int catenateAll;
+  final boolean catenateAll;
 
   /**
-   * If 0, causes case changes to be ignored (subwords will only be generated
-   * given SUBWORD_DELIM tokens). (Defaults to 1)
-   */
-  final int splitOnCaseChange;
-
-  /**
-   * If 1, original words are preserved and added to the subword list (Defaults to 0)
+   * If true, original words are preserved and added to the subword list (Defaults to false)
    * <p/>
    * "500-42" => "500" "42" "500-42"
    */
-  final int preserveOriginal;
-
-  /**
-   * If 0, causes numeric changes to be ignored (subwords will only be generated
-   * given SUBWORD_DELIM tokens). (Defaults to 1)
-   */
-  final int splitOnNumerics;
-
-  /**
-   * If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
-   * <p/>
-   * "O'Neil's" => "O", "Neil"
-   */
-  final int stemEnglishPossessive;
+  final boolean preserveOriginal;
   
   /**
    * If not null, the set of tokens to protect from being delimited
    *
    */
   final CharArraySet protWords;
+    
+  private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+  private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+
+  // used for iterating word delimiter breaks
+  private final WordDelimiterIterator iterator;
+
+  // used for concatenating runs of similar typed subwords (word,number)
+  private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+  // number of subwords last output by concat.
+  private int lastConcatCount = 0;
+
+  // used for catenate all
+  private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+  // used for accumulating position increment gaps
+  private int accumPosInc = 0;
+
+  private char savedBuffer[] = new char[1024];
+  private int savedStartOffset;
+  private int savedEndOffset;
+  private String savedType;
+  private boolean hasSavedState = false;
+  // if length by start + end offsets doesn't match the term text then assume
+  // this is a synonym and don't adjust the offsets.
+  private boolean hasIllegalOffsets = false;
+
+  // for a run of the same subword type within a word, have we output anything?
+  private boolean hasOutputToken = false;
+  // when preserve original is on, have we output any token following it?
+  // this token must have posInc=0!
+  private boolean hasOutputFollowingOriginal = false;
 
   /**
-   *
    * @param in Token stream to be filtered.
    * @param charTypeTable table containing character types
    * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
@@ -175,19 +167,27 @@ final class WordDelimiterFilter extends 
    * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    * @param protWords If not null, the set of tokens to protect from being delimited
    */
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
+  public WordDelimiterFilter(TokenStream in,
+                             byte[] charTypeTable,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal,
+                             int splitOnNumerics,
+                             int stemEnglishPossessive,
+                             CharArraySet protWords) {
     super(in);
-    this.generateWordParts = generateWordParts;
-    this.generateNumberParts = generateNumberParts;
-    this.catenateWords = catenateWords;
-    this.catenateNumbers = catenateNumbers;
-    this.catenateAll = catenateAll;
-    this.splitOnCaseChange = splitOnCaseChange;
-    this.preserveOriginal = preserveOriginal;
-    this.charTypeTable = charTypeTable;
-    this.splitOnNumerics = splitOnNumerics;
-    this.stemEnglishPossessive = stemEnglishPossessive;
+    this.generateWordParts = generateWordParts != 0;
+    this.generateNumberParts = generateNumberParts != 0;
+    this.catenateWords = catenateWords != 0;
+    this.catenateNumbers = catenateNumbers != 0;
+    this.catenateAll = catenateAll != 0;
+    this.preserveOriginal = preserveOriginal != 0;
     this.protWords = protWords;
+    this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
   }
   
   /**
@@ -198,8 +198,18 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
-    this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
+  public WordDelimiterFilter(TokenStream in,
+                             byte[] charTypeTable,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal,
+                             int splitOnNumerics,
+                             CharArraySet protWords) {
+    this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
   }
 
   /**
@@ -210,8 +220,16 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
-    this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
+  public WordDelimiterFilter(TokenStream in,
+                             byte[] charTypeTable,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal) {
+    this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, 1, null);
   }
 
   /**
@@ -227,8 +245,18 @@ final class WordDelimiterFilter extends 
    * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    * @param protWords If not null, the set of tokens to protect from being delimited
    */
-  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+  public WordDelimiterFilter(TokenStream in,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal,
+                             int splitOnNumerics,
+                             int stemEnglishPossessive,
+                             CharArraySet protWords) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
   }
   
   /**
@@ -237,8 +265,17 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
+  public WordDelimiterFilter(TokenStream in,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal,
+                             int splitOnNumerics,
+                             CharArraySet protWords) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
   }
 
   /**
    * Compatibility constructor
@@ -248,8 +285,15 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
+  public WordDelimiterFilter(TokenStream in,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll,
+                             int splitOnCaseChange,
+                             int preserveOriginal) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
   }
   /**
    * Compatibility constructor
@@ -259,7 +303,13 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
+  public WordDelimiterFilter(TokenStream in,
+                             byte[] charTypeTable,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll) {
     this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
   }
   /**
@@ -270,407 +320,365 @@ final class WordDelimiterFilter extends 
    *             instead.
    */
   @Deprecated
-  public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
-    this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
+  public WordDelimiterFilter(TokenStream in,
+                             int generateWordParts,
+                             int generateNumberParts,
+                             int catenateWords,
+                             int catenateNumbers,
+                             int catenateAll) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
   }
+  
+  public boolean incrementToken() throws IOException {
+    while (true) {
+      if (!hasSavedState) {
+        // process a new input word
+        if (!input.incrementToken()) {
+          return false;
+        }
 
+        int termLength = termAttribute.termLength();
+        char[] termBuffer = termAttribute.termBuffer();
+        
+        accumPosInc += posIncAttribute.getPositionIncrement();
+
+        iterator.setText(termBuffer, termLength);
+        iterator.next();
+
+        // word with no delimiters, or a protected word: just return it
+        if ((iterator.current == 0 && iterator.end == termLength) ||
+            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          return true;
+        }
+        
+        // word consisting only of delimiters
+        if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
+          // if the posInc is 1, simply ignore it in the accumulation
+          if (posIncAttribute.getPositionIncrement() == 1) {
+            accumPosInc--;
+          }
+          continue;
+        }
 
-  int charType(int ch) {
-    if (ch<charTypeTable.length) {
-      return charTypeTable[ch];
-    }
-    switch (Character.getType(ch)) {
-      case Character.UPPERCASE_LETTER: return UPPER;
-      case Character.LOWERCASE_LETTER: return LOWER;
-
-      case Character.TITLECASE_LETTER:
-      case Character.MODIFIER_LETTER:
-      case Character.OTHER_LETTER:
-      case Character.NON_SPACING_MARK:
-      case Character.ENCLOSING_MARK:  // depends what it encloses?
-      case Character.COMBINING_SPACING_MARK:
-        return ALPHA; 
-
-      case Character.DECIMAL_DIGIT_NUMBER:
-      case Character.LETTER_NUMBER:
-      case Character.OTHER_NUMBER:
-        return DIGIT;
-
-      // case Character.SPACE_SEPARATOR:
-      // case Character.LINE_SEPARATOR:
-      // case Character.PARAGRAPH_SEPARATOR:
-      // case Character.CONTROL:
-      // case Character.FORMAT:
-      // case Character.PRIVATE_USE:
-
-      case Character.SURROGATE:  // prevent splitting
-        return ALPHA|DIGIT;  
-
-      // case Character.DASH_PUNCTUATION:
-      // case Character.START_PUNCTUATION:
-      // case Character.END_PUNCTUATION:
-      // case Character.CONNECTOR_PUNCTUATION:
-      // case Character.OTHER_PUNCTUATION:
-      // case Character.MATH_SYMBOL:
-      // case Character.CURRENCY_SYMBOL:
-      // case Character.MODIFIER_SYMBOL:
-      // case Character.OTHER_SYMBOL:
-      // case Character.INITIAL_QUOTE_PUNCTUATION:
-      // case Character.FINAL_QUOTE_PUNCTUATION:
+        saveState();
 
-      default: return SUBWORD_DELIM;
+        hasOutputToken = false;
+        hasOutputFollowingOriginal = !preserveOriginal;
+        lastConcatCount = 0;
+        
+        if (preserveOriginal) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          return true;
+        }
+      }
+      
+      // at the end of the string, output any concatenations
+      if (iterator.end == WordDelimiterIterator.DONE) {
+        if (!concat.isEmpty()) {
+          if (flushConcatenation(concat)) {
+            return true;
+          }
+        }
+        
+        if (!concatAll.isEmpty()) {
+          // only if we haven't output this same combo above!
+          if (concatAll.subwordCount > lastConcatCount) {
+            concatAll.writeAndClear();
+            return true;
+          }
+          concatAll.clear();
+        }
+        
+        // no saved concatenations, on to the next input word
+        hasSavedState = false;
+        continue;
+      }
+      
+      // word surrounded by delimiters: always output
+      if (iterator.isSingleWord()) {
+        generatePart(true);
+        iterator.next();
+        return true;
+      }
+      
+      int wordType = iterator.type();
+      
+      // do we already have queued up incompatible concatenations?
+      if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+        if (flushConcatenation(concat)) {
+          hasOutputToken = false;
+          return true;
+        }
+        hasOutputToken = false;
+      }
+      
+      // add subwords depending upon options
+      if (shouldConcatenate(wordType)) {
+        if (concat.isEmpty()) {
+          concat.type = wordType;
+        }
+        concatenate(concat);
+      }
+      
+      // add all subwords (catenateAll)
+      if (catenateAll) {
+        concatenate(concatAll);
+      }
+      
+      // if we should output the word or number part
+      if (shouldGenerateParts(wordType)) {
+        generatePart(false);
+        iterator.next();
+        return true;
+      }
+        
+      iterator.next();
     }
   }
 
-  // use the type of the first char as the type
-  // of the token.
-  private int tokType(Token t) {
-    return charType(t.termBuffer()[0]);
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    hasSavedState = false;
+    concat.clear();
+    concatAll.clear();
+    accumPosInc = 0;
   }
 
-  // There isn't really an efficient queue class, so we will
-  // just use an array for now.
-  private ArrayList<Token> queue = new ArrayList<Token>(4);
-  private int queuePos=0;
-
-  // temporary working queue
-  private ArrayList<Token> tlist = new ArrayList<Token>(4);
+  // ================================================= Helper Methods ================================================
 
+  /**
+   * Saves the existing attribute states
+   */
+  private void saveState() {
+    // we have delimiters; save the token state before iterating subwords
+    savedStartOffset = offsetAttribute.startOffset();
+    savedEndOffset = offsetAttribute.endOffset();
+    // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.termLength());
+    savedType = typeAttribute.type();
 
-  private Token newTok(Token orig, int start, int end) {
-    int startOff = orig.startOffset();
-    int endOff = orig.endOffset();
-    // if length by start + end offsets doesn't match the term text then assume
-    // this is a synonym and don't adjust the offsets.
-    if (orig.termLength() == endOff-startOff) {
-      endOff = startOff + end;
-      startOff += start;     
+    if (savedBuffer.length < termAttribute.termLength()) {
+      savedBuffer = new char[ArrayUtil.getNextSize(termAttribute.termLength())];
     }
 
-    return (Token)orig.clone(orig.termBuffer(), start, (end - start), startOff, endOff);
-  }
-
-
-  public final Token next(Token in) throws IOException {
+    System.arraycopy(termAttribute.termBuffer(), 0, savedBuffer, 0, termAttribute.termLength());
+    iterator.text = savedBuffer;
 
-    // check the queue first
-    if (queuePos<queue.size()) {
-      return queue.get(queuePos++);
-    }
+    hasSavedState = true;
+  }
 
-    // reset the queue if it had been previously used
-    if (queuePos!=0) {
-      queuePos=0;
-      queue.clear();
+  /**
+   * Flushes the given WordDelimiterConcatenation by either writing its contents and then clearing it, or just clearing it.
+   *
+   * @param concatenation WordDelimiterConcatenation that will be flushed
+   * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+   */
+  private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+    lastConcatCount = concatenation.subwordCount;
+    if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+      concatenation.writeAndClear();
+      return true;
     }
+    concatenation.clear();
+    return false;
+  }
 
+  /**
+   * Determines whether to concatenate a word or number if the current word is the given type
+   *
+   * @param wordType Type of the current word used to determine if it should be concatenated
+   * @return {@code true} if concatenation should occur, {@code false} otherwise
+   */
+  private boolean shouldConcatenate(int wordType) {
+    return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
+  }
 
-    // optimize for the common case: assume there will be
-    // no subwords (just a simple word)
-    //
-    // Would it actually be faster to check for the common form
-    // of isLetter() isLower()*, and then backtrack if it doesn't match?
-
-    int origPosIncrement = 0;
-    Token t;
-    while(true) {
-      // t is either returned, or a new token is made from it, so it should
-      // be safe to use the next(Token) method.
-      t = input.next(in);
-      if (t == null) return null;
-
-      char [] termBuffer = t.termBuffer();
-      int len = t.termLength();
-      int start=0;
-      if (len ==0) continue;
-
-      int posInc = t.getPositionIncrement();
-      origPosIncrement += posInc;
-
-      //skip protected tokens
-      if (protWords != null && protWords.contains(termBuffer, 0, len)) {
-        t.setPositionIncrement(origPosIncrement);
-        return t;
-      }
-
-      // Avoid calling charType more than once for each char (basically
-      // avoid any backtracking).
-      // makes code slightly more difficult, but faster.
-      int ch=termBuffer[start];
-      int type=charType(ch);
-
-      int numWords=0;
-
-      while (start< len) {
-        // first eat delimiters at the start of this subword
-        while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
-          ch=termBuffer[start];
-          type=charType(ch);
-        }
-
-        int pos=start;
-
-        // save the type of the first char of the subword
-        // as a way to tell what type of subword token this is (number, word, etc)
-        int firstType=type;
-        int lastType=type;  // type of the previously read char
-
-
-        while (pos< len) {
-
-          if ((type & lastType)==0) {  // no overlap in character type
-            // check and remove "'s" from the end of a token.
-            // the pattern to check for is
-            //   ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
-            if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
-              if (ch=='\'' && pos+1< len
-                      && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
-              {
-                int subWordEnd=pos;
-                if (pos+2>= len) {
-                  // end of string detected after "'s"
-                  pos+=2;
-                } else {
-                  // make sure that a delimiter follows "'s"
-                  int ch2 = termBuffer[pos+2];
-                  int type2 = charType(ch2);
-                  if ((type2 & SUBWORD_DELIM)!=0) {
-                    // if delimiter, move position pointer
-                    // to it (skipping over "'s"
-                    ch=ch2;
-                    type=type2;
-                    pos+=2;
-                  }
-                }
-
-                queue.add(newTok(t,start,subWordEnd));
-                if ((firstType & ALPHA)!=0) numWords++;
-                break;
-              }
-            }
-
-            // For case changes, only split on a transition from
-            // lower to upper case, not vice-versa.
-            // That will correctly handle the
-            // case of a word starting with a capital (won't split).
-            // It will also handle pluralization of
-            // an uppercase word such as FOOs (won't split).
-
-            if (splitOnCaseChange == 0 && 
-                (lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
-              // ALPHA->ALPHA: always ignore if case isn't considered.
-            } else if ((lastType & UPPER)!=0 && (type & ALPHA)!=0) {
-              // UPPER->letter: Don't split
-            } else if(splitOnNumerics == 0 &&
-                ( ((lastType &  ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType &  DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
-              // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
-            } else {
-              // NOTE: this code currently assumes that only one flag
-              // is set for each character now, so we don't have
-              // to explicitly check for all the classes of transitions
-              // listed below.
-
-              // LOWER->UPPER
-              // ALPHA->NUMERIC
-              // NUMERIC->ALPHA
-              // *->DELIMITER
-              queue.add(newTok(t,start,pos));
-              if ((firstType & ALPHA)!=0) numWords++;
-              break;
-            }
-          }
-
-          if (++pos >= len) {
-            if (start==0) {
-              // the subword is the whole original token, so
-              // return it unchanged.
-              t.setPositionIncrement(origPosIncrement);
-              return t;
-            }
-
-            // optimization... if this is the only token,
-            // return it immediately.
-            if (queue.size()==0 && preserveOriginal == 0) {
-              // just adjust the text w/o changing the rest
-              // of the original token
-              t.setTermBuffer(termBuffer, start, len-start);
-              t.setStartOffset(t.startOffset() + start);
-              t.setPositionIncrement(origPosIncrement);              
-              return t;
-            }
-
-            Token newtok = newTok(t,start,pos);
-
-            queue.add(newtok);
-            if ((firstType & ALPHA)!=0) numWords++;
-            break;
-          }
-
-          lastType = type;
-          ch = termBuffer[pos];
-          type = charType(ch);
-        }
-
-        // start of the next subword is the current position
-        start=pos;
-      }
-
-      // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue));
-
-      final int numtok = queue.size();
-
-      // We reached the end of the current token.
-      // If the queue is empty, we should continue by reading
-      // the next token
-      if (numtok==0) {
-        // the token might have been all delimiters, in which
-        // case return it if we're meant to preserve it
-        if (preserveOriginal != 0) {
-          return t;
-        }
-
-        // if this token had a "normal" gap of 1, remove it.
-        if (posInc==1) origPosIncrement-=1;
-        continue;
-      }
-
-      // if number of tokens is 1, there are no catenations to be done.
-      if (numtok==1) {
-        break;
-      }
-
-
-      final int numNumbers = numtok - numWords;
+  /**
+   * Determines whether a word/number part should be generated for a word of the given type
+   *
+   * @param wordType Type of the word used to determine if a word/number part should be generated
+   * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+   */
+  private boolean shouldGenerateParts(int wordType) {
+    return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
+  }
 
-      // check conditions under which the current token
-      // queue may be used as-is (no catenations needed)
-      if (catenateAll==0    // no "everything" to catenate
-        && (catenateWords==0 || numWords<=1)   // no words to catenate
-        && (catenateNumbers==0 || numNumbers<=1)    // no numbers to catenate
-        && (generateWordParts!=0 || numWords==0)  // word generation is on
-        && (generateNumberParts!=0 || numNumbers==0)) // number generation is on
-      {
-        break;
-      }
+  /**
+   * Concatenates the saved buffer to the given WordDelimiterConcatenation
+   *
+   * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+   */
+  private void concatenate(WordDelimiterConcatenation concatenation) {
+    if (concatenation.isEmpty()) {
+      concatenation.startOffset = savedStartOffset + iterator.current;
+    }
+    concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+    concatenation.endOffset = savedStartOffset + iterator.end;
+  }
 
+  /**
+   * Generates a word/number part, updating the appropriate attributes
+   *
+   * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+   */
+  private void generatePart(boolean isSingleWord) {
+    clearAttributes();
+    termAttribute.setTermBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
 
-      // swap queue and the temporary working list, then clear the
-      // queue in preparation for adding all combinations back to it.
-      ArrayList<Token> tmp=tlist;
-      tlist=queue;
-      queue=tmp;
-      queue.clear();
-
-      if (numWords==0) {
-        // all numbers
-        addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
-        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
-      } else if (numNumbers==0) {
-        // all words
-        addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
-        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
-      } else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
-        // catenate all *only*
-        // OPT:could be optimized to add to current queue...
-        addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
-        if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
-      }
+    int startOffset = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
+    int endOffset = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
 
-      //
-      // Find all adjacent tokens of the same type.
-      //
-      Token tok = tlist.get(0);
-      boolean isWord = (tokType(tok) & ALPHA) != 0;
-      boolean wasWord=isWord;
-
-      for(int i=0; i<numtok;) {
-          int j;
-          for (j=i+1; j<numtok; j++) {
-            wasWord=isWord;
-            tok = tlist.get(j);
-            isWord = (tokType(tok) & ALPHA) != 0;
-            if (isWord != wasWord) break;
-          }
-          if (wasWord) {
-            addCombos(tlist,i,j,generateWordParts!=0,catenateWords!=0,1);
-          } else {
-            addCombos(tlist,i,j,generateNumberParts!=0,catenateNumbers!=0,1);
-          }
-          i=j;
-      }
+    offsetAttribute.setOffset(startOffset, endOffset);
+    posIncAttribute.setPositionIncrement(position(false));
+    typeAttribute.setType(savedType);
+  }
 
-      // take care catenating all subwords
-      if (catenateAll!=0) {
-        addCombos(tlist,0,numtok,false,true,0);
-      }
+  /**
+   * Get the position increment gap for a subword or concatenation
+   *
+   * @param inject true if this token wants to be injected
+   * @return position increment gap
+   */
+  private int position(boolean inject) {
+    int posInc = accumPosInc;
 
-      // NOTE: in certain cases, queue may be empty (for instance, if catenate
-      // and generate are both set to false).  Only exit the loop if the queue
-      // is not empty.
-      if (queue.size() > 0 || preserveOriginal!=0) break;
+    if (hasOutputToken) {
+      accumPosInc = 0;
+      return inject ? 0 : Math.max(1, posInc);
     }
 
-    // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
-
-    if (preserveOriginal != 0) {
-      queuePos = 0;
-      if (queue.size() > 0) {
-        // overlap first token with the original
-        queue.get(0).setPositionIncrement(0);
-      }
-      return t;  // return the original
-    } else {
-      queuePos=1;
-      Token tok = queue.get(0);
-      tok.setPositionIncrement(origPosIncrement);
-      return tok;
+    hasOutputToken = true;
+    
+    if (!hasOutputFollowingOriginal) {
+      // the first token following the original is 0 regardless
+      hasOutputFollowingOriginal = true;
+      return 0;
     }
+    // clear the accumulated position increment
+    accumPosInc = 0;
+    return Math.max(1, posInc);
   }
 
+  /**
+   * Checks if the given word type includes {@link #ALPHA}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+   */
+  static boolean isAlpha(int type) {
+    return (type & ALPHA) != 0;
+  }
 
-  // index "a","b","c" as  pos0="a", pos1="b", pos2="c", pos2="abc"
-  private void addCombos(List<Token> lst, int start, int end, boolean generateSubwords, boolean catenateSubwords, int posOffset) {
-    if (end-start==1) {
-      // always generate a word alone, even if generateSubwords=0 because
-      // the catenation of all the subwords *is* the subword.
-      queue.add(lst.get(start));
-      return;
-    }
-
-    StringBuilder sb = null;
-    if (catenateSubwords) sb=new StringBuilder();
-    Token firstTok=null;
-    Token tok=null;
-    for (int i=start; i<end; i++) {
-      tok = lst.get(i);
-      if (catenateSubwords) {
-        if (i==start) firstTok=tok;
-        sb.append(tok.termBuffer(), 0, tok.termLength());
-      }
-      if (generateSubwords) {
-        queue.add(tok);
-      }
-    }
+  /**
+   * Checks if the given word type includes {@link #DIGIT}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+   */
+  static boolean isDigit(int type) {
+    return (type & DIGIT) != 0;
+  }
 
-    if (catenateSubwords) {
-      Token concatTok = new Token(sb.toString(),
-              firstTok.startOffset(),
-              tok.endOffset(),
-              firstTok.type());
-      // if we indexed some other tokens, then overlap concatTok with the last.
-      // Otherwise, use the value passed in as the position offset.
-      concatTok.setPositionIncrement(generateSubwords==true ? 0 : posOffset);
-      queue.add(concatTok);
-    }
+  /**
+   * Checks if the given word type includes {@link #SUBWORD_DELIM}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+   */
+  static boolean isSubwordDelim(int type) {
+    return (type & SUBWORD_DELIM) != 0;
   }
 
-  @Override
-  public void reset() throws IOException {
-    input.reset();
-    queuePos=0;
-    queue.clear();    
+  /**
+   * Checks if the given word type includes {@link #UPPER}
+   *
+   * @param type Word type to check
+   * @return {@code true} if the type contains UPPER, {@code false} otherwise
+   */
+  static boolean isUpper(int type) {
+    return (type & UPPER) != 0;
   }
 
+  // ================================================= Inner Classes =================================================
+
+  /**
+   * A WDF concatenated 'run'
+   */
+  final class WordDelimiterConcatenation {
+    final StringBuilder buffer = new StringBuilder();
+    int startOffset;
+    int endOffset;
+    int type;
+    int subwordCount;
+
+    /**
+     * Appends length characters of the given text, starting at the given offset, to the concatenation
+     *
+     * @param text Text to append
+     * @param offset Offset in the text at which to start appending
+     * @param length Number of characters to append
+     */
+    void append(char text[], int offset, int length) {
+      buffer.append(text, offset, length);
+      subwordCount++;
+    }
+
+    /**
+     * Writes the concatenation to the attributes
+     */
+    void write() {
+      clearAttributes();
+      if (termAttribute.termLength() < buffer.length()) {
+        termAttribute.resizeTermBuffer(buffer.length());
+      }
+      char termBuffer[] = termAttribute.termBuffer();
+      
+      buffer.getChars(0, buffer.length(), termBuffer, 0);
+      termAttribute.setTermLength(buffer.length());
+        
+      if (hasIllegalOffsets) {
+        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+      }
+      else {
+        offsetAttribute.setOffset(startOffset, endOffset);
+      }
+      posIncAttribute.setPositionIncrement(position(true));
+      typeAttribute.setType(savedType);
+      accumPosInc = 0;
+    }
+
+    /**
+     * Determines if the concatenation is empty
+     *
+     * @return {@code true} if the concatenation is empty, {@code false} otherwise
+     */
+    boolean isEmpty() {
+      return buffer.length() == 0;
+    }
+
+    /**
+     * Clears the concatenation and resets its state
+     */
+    void clear() {
+      buffer.setLength(0);
+      startOffset = endOffset = type = subwordCount = 0;
+    }
+
+    /**
+     * Convenience method for the common scenario of having to write the concatenation and then clear its state
+     */
+    void writeAndClear() {
+      write();
+      clear();
+    }
+  }
   // questions:
   // negative numbers?  -42 indexed as just 42?
   // dollar sign?  $42
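
For reference, the sketch below drives the reworked filter through the Lucene 2.9
attribute API that this commit adopts. It is a minimal sketch, not part of this
commit: the demo class name, sample text, and flag values are illustrative
assumptions, and the class must live in org.apache.solr.analysis because the
filter is package-private.

    package org.apache.solr.analysis;

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class WordDelimiterFilterDemo {
      public static void main(String[] args) throws Exception {
        // WhitespaceTokenizer keeps intra-word delimiters intact, as the class
        // javadoc recommends for tokenizers feeding this filter
        TokenStream ts = new WordDelimiterFilter(
            new WhitespaceTokenizer(new StringReader("Wi-Fi XL500")),
            1, 1, 1, 1, 0, 1, 0, 1, 1, null);

        // attributes are registered once and reused for every token; read the
        // current values after each successful incrementToken()
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        OffsetAttribute offset = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posInc =
            (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);

        while (ts.incrementToken()) {
          System.out.println(term.term() + " [" + offset.startOffset() + ","
              + offset.endOffset() + ") +" + posInc.getPositionIncrement());
        }
      }
    }

With these flags, "Wi-Fi" should yield "Wi", "Fi" and the catenated "WiFi",
while "XL500" should yield "XL" and "500".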

Added: lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java?rev=922957&view=auto
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java (added)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java Sun Mar 14 20:58:32 2010
@@ -0,0 +1,315 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.solr.analysis.WordDelimiterFilter.*;
+
+/**
+ * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+ */
+final class WordDelimiterIterator {
+
+  /** Indicates the end of iteration */
+  public static final int DONE = -1;
+  
+  public static final byte[] DEFAULT_WORD_DELIM_TABLE;
+
+  char text[];
+  int length;
+  
+  /** start position of text, excluding leading delimiters */
+  int startBounds;
+  /** end position of text, excluding trailing delimiters */
+  int endBounds;
+  
+  /** Beginning of subword */
+  int current;
+  /** End of subword */
+  int end;
+  
+  /* does this string end with a possessive such as 's */
+  private boolean hasFinalPossessive = false;
+  
+  /**
+   * If false, causes case changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens). (Defaults to true)
+   */
+  final boolean splitOnCaseChange;
+  
+  /**
+   * If false, causes numeric changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens). (Defaults to true)
+   */
+  final boolean splitOnNumerics;
+
+  /**
+   * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
+   * <p/>
+   * "O'Neil's" => "O", "Neil"
+   */
+  final boolean stemEnglishPossessive;
+  
+  private final byte[] charTypeTable;
+  
+  /** if true, need to skip over a possessive found in the last call to next() */
+  private boolean skipPossessive = false;
+
+  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+  // done if separated by these chars?) "," would be an obvious candidate...
+  static {
+    byte[] tab = new byte[256];
+    for (int i = 0; i < 256; i++) {
+      byte code = 0;
+      if (Character.isLowerCase(i)) {
+        code |= LOWER;
+      }
+      else if (Character.isUpperCase(i)) {
+        code |= UPPER;
+      }
+      else if (Character.isDigit(i)) {
+        code |= DIGIT;
+      }
+      if (code == 0) {
+        code = SUBWORD_DELIM;
+      }
+      tab[i] = code;
+    }
+    DEFAULT_WORD_DELIM_TABLE = tab;
+  }
+
+  /**
+   * Create a new WordDelimiterIterator operating with the supplied rules.
+   * 
+   * @param charTypeTable table containing character types
+   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
+   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+   */
+  WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+    this.charTypeTable = charTypeTable;
+    this.splitOnCaseChange = splitOnCaseChange;
+    this.splitOnNumerics = splitOnNumerics;
+    this.stemEnglishPossessive = stemEnglishPossessive;
+  }
+  
+  /**
+   * Advance to the next subword in the string.
+   *
+   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+   */
+  int next() {
+    current = end;
+    if (current == DONE) {
+      return DONE;
+    }
+    
+    if (skipPossessive) {
+      current += 2;
+      skipPossessive = false;
+    }
+
+    int lastType = 0;
+    
+    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
+      current++;
+    }
+
+    if (current >= endBounds) {
+      return end = DONE;
+    }
+    
+    for (end = current + 1; end < endBounds; end++) {
+      int type = charType(text[end]);
+      if (isBreak(lastType, type)) {
+        break;
+      }
+      lastType = type;
+    }
+    
+    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
+      skipPossessive = true;
+    }
+    
+    return end;
+  }
+
+
+  /**
+   * Return the type of the current subword.
+   * This currently uses the type of the first character in the subword.
+   *
+   * @return type of the current word
+   */
+  int type() {
+    if (end == DONE) {
+      return 0;
+    }
+    
+    int type = charType(text[current]);
+    switch (type) {
+      // return ALPHA word type for both lower and upper
+      case LOWER:
+      case UPPER:
+        return ALPHA;
+      default:
+        return type;
+    }
+  }
+
+  /**
+   * Reset the text to a new value, and reset all state
+   *
+   * @param text New text
+   * @param length length of the text
+   */
+  void setText(char text[], int length) {
+    this.text = text;
+    this.length = this.endBounds = length;
+    current = startBounds = end = 0;
+    skipPossessive = hasFinalPossessive = false;
+    setBounds();
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  /**
+   * Determines whether the transition from lastType to type indicates a break
+   *
+   * @param lastType Last subword type
+   * @param type Current subword type
+   * @return {@code true} if the transition indicates a break, {@code false} otherwise
+   */
+  private boolean isBreak(int lastType, int type) {
+    if ((type & lastType) != 0) {
+      return false;
+    }
+    
+    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
+      // ALPHA->ALPHA: always ignore if case isn't considered.
+      return false;
+    } else if (isUpper(lastType) && isAlpha(type)) {
+      // UPPER->letter: Don't split
+      return false;
+    } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
+      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
+      return false;
+    }
+
+    return true;
+  }
+  
+  /**
+   * Determines if the current word contains only one subword.  Note, it may still be surrounded by delimiters
+   *
+   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
+   */
+  boolean isSingleWord() {
+    if (hasFinalPossessive) {
+      return current == startBounds && end == endBounds - 2;
+    }
+    else {
+      return current == startBounds && end == endBounds;
+    }
+  }
+   
+  /**
+   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
+   * it yet, simply note it.
+   */
+  private void setBounds() {
+    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
+      startBounds++;
+    }
+    
+    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
+      endBounds--;
+    }
+    if (endsWithPossessive(endBounds)) {
+      hasFinalPossessive = true;
+    }
+    current = startBounds;
+  }
+  
+  /**
+   * Determines if the text at the given position indicates an English possessive which should be removed
+   *
+   * @param pos Position in the text to check if it indicates an English possessive
+   * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
+   */
+  private boolean endsWithPossessive(int pos) {
+    return (stemEnglishPossessive &&
+            pos > 2 &&
+            text[pos - 2] == '\'' &&
+            (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
+            isAlpha(charType(text[pos - 3])) &&
+            (pos == endBounds || isSubwordDelim(charType(text[pos]))));
+  }
+
+  /**
+   * Determines the type of the given character
+   *
+   * @param ch Character whose type is to be determined
+   * @return Type of the character
+   */
+  private int charType(int ch) {
+    if (ch < charTypeTable.length) {
+      return charTypeTable[ch];
+    }
+    switch (Character.getType(ch)) {
+      case Character.UPPERCASE_LETTER: return UPPER;
+      case Character.LOWERCASE_LETTER: return LOWER;
+
+      case Character.TITLECASE_LETTER:
+      case Character.MODIFIER_LETTER:
+      case Character.OTHER_LETTER:
+      case Character.NON_SPACING_MARK:
+      case Character.ENCLOSING_MARK:  // depends what it encloses?
+      case Character.COMBINING_SPACING_MARK:
+        return ALPHA; 
+
+      case Character.DECIMAL_DIGIT_NUMBER:
+      case Character.LETTER_NUMBER:
+      case Character.OTHER_NUMBER:
+        return DIGIT;
+
+      // case Character.SPACE_SEPARATOR:
+      // case Character.LINE_SEPARATOR:
+      // case Character.PARAGRAPH_SEPARATOR:
+      // case Character.CONTROL:
+      // case Character.FORMAT:
+      // case Character.PRIVATE_USE:
+
+      case Character.SURROGATE:  // prevent splitting
+        return ALPHA|DIGIT;  
+
+      // case Character.DASH_PUNCTUATION:
+      // case Character.START_PUNCTUATION:
+      // case Character.END_PUNCTUATION:
+      // case Character.CONNECTOR_PUNCTUATION:
+      // case Character.OTHER_PUNCTUATION:
+      // case Character.MATH_SYMBOL:
+      // case Character.CURRENCY_SYMBOL:
+      // case Character.MODIFIER_SYMBOL:
+      // case Character.OTHER_SYMBOL:
+      // case Character.INITIAL_QUOTE_PUNCTUATION:
+      // case Character.FINAL_QUOTE_PUNCTUATION:
+
+      default: return SUBWORD_DELIM;
+    }
+  }
+}
\ No newline at end of file
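
The new iterator follows a BreakIterator-style protocol: setText() resets the
state, each next() call advances [current, end) to the following subword, and
type() classifies it. Below is a minimal sketch of that loop, not part of this
commit; the demo class name and input string are illustrative assumptions, and
same-package access is assumed since the class is package-private.

    package org.apache.solr.analysis;

    class WordDelimiterIteratorDemo {
      public static void main(String[] args) {
        // all splitting rules enabled, using the default character type table
        WordDelimiterIterator it = new WordDelimiterIterator(
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, true, true, true);

        char[] text = "Super-Duper-XL500".toCharArray();
        it.setText(text, text.length);

        // next() positions [current, end) over each subword until DONE
        while (it.next() != WordDelimiterIterator.DONE) {
          String subword = new String(text, it.current, it.end - it.current);
          String kind = WordDelimiterFilter.isDigit(it.type()) ? "DIGIT" : "ALPHA";
          System.out.println(subword + " (" + kind + ")");
        }
      }
    }

Expected output: "Super", "Duper", "XL" (ALPHA) and "500" (DIGIT).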

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -20,6 +20,12 @@ import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.SolrParams;
@@ -132,15 +138,20 @@ public class AnalysisRequestHandler exte
   static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
     // outer is namedList since order of tokens is important
     NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
-    Token t = null;
-    while (((t = tstream.next()) != null)) {
+    // TODO: support custom attributes
+    TermAttribute termAtt = (TermAttribute) tstream.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) tstream.addAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) tstream.addAttribute(TypeAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tstream.addAttribute(PositionIncrementAttribute.class);
+    
+    while (tstream.incrementToken()) {
       NamedList<Object> token = new SimpleOrderedMap<Object>();
       tokens.add("token", token);
-      token.add("value", new String(t.termBuffer(), 0, t.termLength()));
-      token.add("start", t.startOffset());
-      token.add("end", t.endOffset());
-      token.add("posInc", t.getPositionIncrement());
-      token.add("type", t.type());
+      token.add("value", new String(termAtt.termBuffer(), 0, termAtt.termLength()));
+      token.add("start", offsetAtt.startOffset());
+      token.add("end", offsetAtt.endOffset());
+      token.add("posInc", posIncAtt.getPositionIncrement());
+      token.add("type", typeAtt.type());
       //TODO: handle payloads
     }
     return tokens;
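
One detail worth noting in getTokens() above: with incrementToken(), each
attribute is registered once and its single instance is reused for every token,
so any value that must outlive the loop has to be copied out, which is why the
term text goes through new String(termBuffer, 0, termLength). A minimal sketch
of the same pattern, not part of this commit; the demo class name, analyzer,
and sample text are illustrative assumptions.

    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class AttributeCopyDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new WhitespaceAnalyzer()
            .tokenStream("text", new StringReader("wi-fi power shot"));
        // addAttribute returns the single instance reused for every token ...
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);

        List<String> values = new ArrayList<String>();
        while (ts.incrementToken()) {
          // ... so copy each value out; term() allocates a fresh String
          values.add(term.term());
        }
        System.out.println(values); // [wi-fi, power, shot]
      }
    }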

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java Sun Mar 14 20:58:32 2010
@@ -22,6 +22,12 @@ import org.apache.lucene.analysis.CharRe
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.solr.analysis.CharFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
@@ -141,11 +147,30 @@ public abstract class AnalysisRequestHan
    */
   private List<Token> analyzeTokenStream(TokenStream tokenStream) {
     List<Token> tokens = new ArrayList<Token>();
-    Token reusableToken = new Token();
-    Token token = null;
 
+    // TODO change this API to support custom attributes
+    TermAttribute termAtt = (TermAttribute) 
+      tokenStream.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) 
+      tokenStream.addAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) 
+      tokenStream.addAttribute(TypeAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) 
+      tokenStream.addAttribute(FlagsAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) 
+      tokenStream.addAttribute(PayloadAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) 
+      tokenStream.addAttribute(PositionIncrementAttribute.class);
+    
     try {
-      while ((token = tokenStream.next(reusableToken)) != null) {
+      while (tokenStream.incrementToken()) {
+        Token token = new Token();
+        token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+        token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+        token.setType(typeAtt.type());
+        token.setFlags(flagsAtt.getFlags());
+        token.setPayload(payloadAtt.getPayload());
+        token.setPositionIncrement(posIncAtt.getPositionIncrement());
         tokens.add((Token) token.clone());
       }
     } catch (IOException ioe) {
@@ -229,16 +254,30 @@ public abstract class AnalysisRequestHan
   /**
    * TokenStream that iterates over a list of pre-existing Tokens
    */
+  // TODO refactor to support custom attributes
   protected static class ListBasedTokenStream extends TokenStream {
+    private final List<Token> tokens;
+    private Iterator<Token> tokenIterator;
 
-    private final Iterator<Token> tokenIterator;
-
+    private final TermAttribute termAtt = (TermAttribute) 
+      addAttribute(TermAttribute.class);
+    private final OffsetAttribute offsetAtt = (OffsetAttribute) 
+      addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAtt = (TypeAttribute) 
+      addAttribute(TypeAttribute.class);
+    private final FlagsAttribute flagsAtt = (FlagsAttribute) 
+      addAttribute(FlagsAttribute.class);
+    private final PayloadAttribute payloadAtt = (PayloadAttribute) 
+      addAttribute(PayloadAttribute.class);
+    private final PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) 
+      addAttribute(PositionIncrementAttribute.class);
     /**
      * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
      *
      * @param tokens Source of tokens to be used
      */
     ListBasedTokenStream(List<Token> tokens) {
+      this.tokens = tokens;
       tokenIterator = tokens.iterator();
     }
 
@@ -246,8 +285,25 @@ public abstract class AnalysisRequestHan
      * {@inheritDoc}
      */
     @Override
-    public Token next(Token token) throws IOException {
-      return (tokenIterator.hasNext()) ? tokenIterator.next() : null;
+    public boolean incrementToken() throws IOException {
+      if (tokenIterator.hasNext()) {
+        Token next = tokenIterator.next();
+        termAtt.setTermBuffer(next.termBuffer(), 0, next.termLength());
+        typeAtt.setType(next.type());
+        offsetAtt.setOffset(next.startOffset(), next.endOffset());
+        flagsAtt.setFlags(next.getFlags());
+        payloadAtt.setPayload(next.getPayload());
+        posIncAtt.setPositionIncrement(next.getPositionIncrement());
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      tokenIterator = tokens.iterator();
     }
   }
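
[Note: the reset() override matters here because consumers may rewind and replay the stream, which the old one-shot iterator could not support. The same pattern in a self-contained form (an illustrative class, not the Solr one):

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.List;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    final class ReplayableTermStream extends TokenStream {
      private final List<String> terms;
      private Iterator<String> it;
      private final TermAttribute termAtt =
          (TermAttribute) addAttribute(TermAttribute.class);

      ReplayableTermStream(List<String> terms) {
        this.terms = terms;
        this.it = terms.iterator();
      }

      @Override
      public boolean incrementToken() {
        if (!it.hasNext()) return false;
        termAtt.setTermBuffer(it.next()); // copy the term into the shared buffer
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        it = terms.iterator(); // replay from the start
      }
    }
]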
 

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -258,7 +258,7 @@ public class SpellCheckerRequestHandler 
         }
         dirDescription = f.getAbsolutePath();
         log.info("using spell directory: " + dirDescription);
-        spellcheckerIndexDir = FSDirectory.getDirectory(f);
+        spellcheckerIndexDir = FSDirectory.open(f);
       } else {
         log.info("using RAM based spell directory");
       }
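
[Note: FSDirectory.getDirectory(File) is deprecated in this Lucene version; the open(File) factory chooses a platform-appropriate FSDirectory implementation (e.g. NIO-based where that is faster) instead of a fixed one. Usage is otherwise identical; hypothetical path:

    import java.io.File;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    Directory spellDir = FSDirectory.open(new File("/path/to/spellchecker"));
]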

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java Sun Mar 14 20:58:32 2010
@@ -40,7 +40,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ConstantScoreRangeQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.PriorityQueue;
@@ -172,7 +172,8 @@ public class LukeRequestHandler extends 
     flags.append( (f != null && f.getOmitNorms())                  ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
     flags.append( (f != null && f.isLazy())                        ? FieldFlag.LAZY.getAbbreviation() : '-' );
     flags.append( (f != null && f.isBinary())                      ? FieldFlag.BINARY.getAbbreviation() : '-' );
-    flags.append( (f != null && f.isCompressed())                  ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
+    //nocommit: handle compressed
+    //flags.append( (f != null && f.isCompressed())                  ? FieldFlag.COMPRESSED.getAbbreviation() : '-' );
     flags.append( (false)                                          ? FieldFlag.SORT_MISSING_FIRST.getAbbreviation() : '-' ); // SchemaField Specific
     flags.append( (false)                                          ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
     return flags.toString();
@@ -312,7 +313,7 @@ public class LukeRequestHandler extends 
 
       // If numTerms==0, the call is just asking for a quick field list
       if( ttinfo != null && sfield != null && sfield.indexed() ) {
-        Query q = new ConstantScoreRangeQuery(fieldName,null,null,false,false); 
+        Query q = new TermRangeQuery(fieldName,null,null,false,false); 
         TopDocs top = searcher.search( q, 1 );
         if( top.totalHits > 0 ) {
           // Find a document with this field
@@ -652,7 +653,7 @@ public class LukeRequestHandler extends 
         }
         
         if( terms.docFreq() > tiq.minFreq ) {
-          tiq.put(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
+          tiq.add(new TopTermQueue.TermInfo(terms.term(), terms.docFreq()));
             if (tiq.size() > numTerms) { // if tiq full
             tiq.pop(); // remove lowest in tiq
             tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq
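
[Note: two API moves are tracked in this file: ConstantScoreRangeQuery was absorbed into TermRangeQuery (an open-ended range doubles as a cheap "is this field populated" probe), and Lucene's PriorityQueue renamed put() to add(); the size check/pop pattern is unchanged. Reusing the names from the hunks above:

    // Unbounded range: matches any document with at least one indexed term
    // in fieldName; used above only to locate one such document.
    Query q = new TermRangeQuery(fieldName, null, null, false, false);
    TopDocs top = searcher.search(q, 1);
    boolean hasValue = top.totalHits > 0;
]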

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryComponent.java Sun Mar 14 20:58:32 2010
@@ -33,6 +33,7 @@ import org.apache.solr.common.params.Sha
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;
@@ -305,7 +306,6 @@ public class QueryComponent extends Sear
   public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
     if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
       mergeIds(rb, sreq);
-      return;
     }
 
     if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
@@ -399,7 +399,8 @@ public class QueryComponent extends Sear
 
       // Merge the docs via a priority queue so we don't have to sort *all* of the
       // documents... we only need to order the top (rows+start)
-      ShardFieldSortedHitQueue queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
+      ShardFieldSortedHitQueue queue;
+      queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount());
 
       long numFound = 0;
       Float maxScore=null;
@@ -451,7 +452,7 @@ public class QueryComponent extends Sear
 
           shardDoc.sortFieldValues = sortFieldValues;
 
-          queue.insert(shardDoc);
+          queue.insertWithOverflow(shardDoc);
         } // end for-each-doc-in-response
       } // end for-each-response
 

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/QueryElevationComponent.java Sun Mar 14 20:58:32 2010
@@ -38,8 +38,8 @@ import javax.xml.xpath.XPathExpressionEx
 import javax.xml.xpath.XPathFactory;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.*;
@@ -298,10 +298,9 @@ public class QueryElevationComponent ext
     TokenStream tokens = analyzer.reusableTokenStream( "", new StringReader( query ) );
     tokens.reset();
     
-    Token token = tokens.next();
-    while( token != null ) {
-      norm.append( new String(token.termBuffer(), 0, token.termLength()) );
-      token = tokens.next();
+    TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
+    while( tokens.incrementToken() ) {
+      norm.append( termAtt.termBuffer(), 0, termAtt.termLength() );
     }
     return norm.toString();
   }
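
[Note: a subtlety preserved by this rewrite: reusableTokenStream() hands back a per-thread shared stream, so reset() must be called before consumption, as done above, and the char[]-based append avoids allocating a String per token. The consuming idiom in isolation, assuming any Analyzer named analyzer:

    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.reusableTokenStream("", new StringReader("iPod nano"));
    tokens.reset(); // required before the first incrementToken() on a reused stream
    TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
    while (tokens.incrementToken()) {
      // append(char[], int, int) copies straight from the shared term buffer
      norm.append(termAtt.termBuffer(), 0, termAtt.termLength());
    }
]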

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java Sun Mar 14 20:58:32 2010
@@ -33,6 +33,12 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.CommonParams;
@@ -332,7 +338,7 @@ public class SpellCheckComponent extends
       // create token
       SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
       Token token = new Token();
-      token.setTermText(original);
+      token.setTermBuffer(original);
       token.setStartOffset(suggestion.getStartOffset());
       token.setEndOffset(suggestion.getEndOffset());
 
@@ -364,10 +370,24 @@ public class SpellCheckComponent extends
 
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
-    Token token = null;
     TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
     ts.reset();
-    while ((token = ts.next()) != null){
+    // TODO: support custom attributes
+    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) ts.addAttribute(FlagsAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+    
+    while (ts.incrementToken()){
+      Token token = new Token();
+      token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+      token.setType(typeAtt.type());
+      token.setFlags(flagsAtt.getFlags());
+      token.setPayload(payloadAtt.getPayload());
+      token.setPositionIncrement(posIncAtt.getPositionIncrement());
       result.add(token);
     }
     return result;
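
[Note: Token.setTermText(String) is deprecated because Token now keeps its term only in a reusable char[]; setTermBuffer(String) writes into that buffer instead. The loop above is also the standard way to materialize attribute snapshots into standalone Token objects that outlive the stream. In isolation, with a hypothetical misspelled term:

    Token token = new Token();
    token.setTermBuffer("acsess");  // replaces the deprecated setTermText("acsess")
    token.setOffset(0, 6);
    token.setPositionIncrement(1);
]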

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java Sun Mar 14 20:58:32 2010
@@ -113,7 +113,7 @@ public class TermVectorComponent extends
     IndexSchema schema = rb.req.getSchema();
     String uniqFieldName = schema.getUniqueKeyField().getName();
     //Only load the id field
-    SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
+    SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
     while (iter.hasNext()) {
       Integer docId = iter.next();
       NamedList docNL = new NamedList();
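
[Note: the added type witness works around pre-Java-8 inference: in an argument position the compiler infers Collections.emptySet() as Set<Object>, which fails to compile once the selector's parameters are generified to Set<String>, as in the Lucene version this branch builds against. Assignment context infers fine by contrast:

    Set<String> ok = Collections.emptySet();  // inferred from the assignment target
    // new SetBasedFieldSelector(ids, Collections.emptySet())         -> compile error
    // new SetBasedFieldSelector(ids, Collections.<String>emptySet()) -> OK
]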

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Sun Mar 14 20:58:32 2010
@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.Cachin
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.highlight.*;
@@ -39,6 +40,7 @@ import org.apache.lucene.search.vectorhi
 import org.apache.lucene.search.vectorhighlight.FieldQuery;
 import org.apache.lucene.search.vectorhighlight.FragListBuilder;
 import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.util.AttributeSource.State;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.HighlightParams;
 import org.apache.solr.common.params.SolrParams;
@@ -512,28 +514,28 @@ public class DefaultSolrHighlighter exte
  */
 class TokenOrderingFilter extends TokenFilter {
   private final int windowSize;
-  private final LinkedList<Token> queue = new LinkedList<Token>();
+  private final LinkedList<OrderedToken> queue = new LinkedList<OrderedToken>();
   private boolean done=false;
-
+  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  
   protected TokenOrderingFilter(TokenStream input, int windowSize) {
     super(input);
     this.windowSize = windowSize;
   }
 
   @Override
-  public Token next() throws IOException {
+  public boolean incrementToken() throws IOException {
     while (!done && queue.size() < windowSize) {
-      Token newTok = input.next();
-      if (newTok==null) {
-        done=true;
+      if (!input.incrementToken()) {
+        done = true;
         break;
       }
 
       // reverse iterating for better efficiency since we know the
       // list is already sorted, and most token start offsets will be too.
-      ListIterator<Token> iter = queue.listIterator(queue.size());
+      ListIterator<OrderedToken> iter = queue.listIterator(queue.size());
       while(iter.hasPrevious()) {
-        if (newTok.startOffset() >= iter.previous().startOffset()) {
+        if (offsetAtt.startOffset() >= iter.previous().startOffset) {
           // insertion will be before what next() would return (what
           // we just compared against), so move back one so the insertion
           // will be after.
@@ -541,50 +543,82 @@ class TokenOrderingFilter extends TokenF
           break;
         }
       }
-      iter.add(newTok);
+      OrderedToken ot = new OrderedToken();
+      ot.state = captureState();
+      ot.startOffset = offsetAtt.startOffset();
+      iter.add(ot);
     }
 
-    return queue.isEmpty() ? null : queue.removeFirst();
+    if (queue.isEmpty()) {
+      return false;
+    } else {
+      restoreState(queue.removeFirst().state);
+      return true;
+    }
   }
 }
 
+// for TokenOrderingFilter, so it can easily sort by startOffset
+class OrderedToken {
+  State state;
+  int startOffset;
+}
+
 class TermOffsetsTokenStream {
 
   TokenStream bufferedTokenStream = null;
-  Token bufferedToken;
+  OffsetAttribute bufferedOffsetAtt;
+  State bufferedToken;
+  int bufferedStartOffset;
+  int bufferedEndOffset;
   int startOffset;
   int endOffset;
 
   public TermOffsetsTokenStream( TokenStream tstream ){
     bufferedTokenStream = tstream;
+    bufferedOffsetAtt = (OffsetAttribute) bufferedTokenStream.addAttribute(OffsetAttribute.class);
     startOffset = 0;
     bufferedToken = null;
   }
 
   public TokenStream getMultiValuedTokenStream( final int length ){
     endOffset = startOffset + length;
-    return new TokenStream(){
-      Token token;
-      public Token next() throws IOException {
+    return new MultiValuedStream(length);
+  }
+  
+  class MultiValuedStream extends TokenStream {
+    private final int length;
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+      MultiValuedStream(int length) { 
+        super(bufferedTokenStream.cloneAttributes());
+        this.length = length;
+      }
+      
+      public boolean incrementToken() throws IOException {
         while( true ){
-          if( bufferedToken == null )
-            bufferedToken = bufferedTokenStream.next();
-          if( bufferedToken == null ) return null;
-          if( startOffset <= bufferedToken.startOffset() &&
-              bufferedToken.endOffset() <= endOffset ){
-            token = bufferedToken;
+          if( bufferedToken == null ) {
+            if (!bufferedTokenStream.incrementToken())
+              return false;
+            bufferedToken = bufferedTokenStream.captureState();
+            bufferedStartOffset = bufferedOffsetAtt.startOffset();
+            bufferedEndOffset = bufferedOffsetAtt.endOffset();
+          }
+          
+          if( startOffset <= bufferedStartOffset &&
+              bufferedEndOffset <= endOffset ){
+            restoreState(bufferedToken);
             bufferedToken = null;
-            token.setStartOffset( token.startOffset() - startOffset );
-            token.setEndOffset( token.endOffset() - startOffset );
-            return token;
+            offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
+            return true;
           }
-          else if( bufferedToken.endOffset() > endOffset ){
+          else if( bufferedEndOffset > endOffset ){
             startOffset += length + 1;
-            return null;
+            return false;
           }
           bufferedToken = null;
         }
       }
-    };
-  }
-}
+
+  };
+};
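
[Note: captureState()/restoreState() is the attribute-API replacement for buffering whole Token objects: a State snapshots every attribute's current value so it can be reinstated later, which is how both TokenOrderingFilter and the lookahead in TermOffsetsTokenStream now work. A minimal buffering filter showing the idiom:

    import java.io.IOException;
    import java.util.LinkedList;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.util.AttributeSource.State;

    // Buffers the entire input, then replays it.
    final class BufferAllFilter extends TokenFilter {
      private final LinkedList<State> buffer = new LinkedList<State>();
      private boolean buffered = false;

      BufferAllFilter(TokenStream input) { super(input); }

      @Override
      public boolean incrementToken() throws IOException {
        if (!buffered) {
          while (input.incrementToken()) {
            buffer.add(captureState()); // snapshot all attributes of this token
          }
          buffered = true;
        }
        if (buffer.isEmpty()) return false;
        restoreState(buffer.removeFirst()); // repopulate attributes from the snapshot
        return true;
      }
    }
]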

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/response/BaseResponseWriter.java Sun Mar 14 20:58:32 2010
@@ -176,7 +176,7 @@ public abstract class BaseResponseWriter
       Object val = null;
       if (ft == null) { // handle fields not in the schema
         if (f.isBinary())
-          val = f.binaryValue();
+          val = f.getBinaryValue();
         else
           val = f.stringValue();
       } else {
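
[Note: Fieldable.binaryValue() is deprecated in favor of getBinaryValue(), which may hand back the field's internal array (paired with getBinaryOffset()/getBinaryLength()) rather than guaranteeing a fresh copy; the same mechanical rename appears in BinaryResponseWriter and the Trie fields below. Sketch of the fuller form:

    byte[] data = f.getBinaryValue();  // possibly the internal buffer, not a copy
    int offset  = f.getBinaryOffset(); // first valid byte
    int length  = f.getBinaryLength(); // number of valid bytes
]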

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/response/BinaryResponseWriter.java Sun Mar 14 20:58:32 2010
@@ -140,7 +140,7 @@ public class BinaryResponseWriter implem
         if(sf != null) ft =sf.getType();
         Object val;
         if (ft == null) {  // handle fields not in the schema
-          if (f.isBinary()) val = f.binaryValue();
+          if (f.isBinary()) val = f.getBinaryValue();
           else val = f.stringValue();
         } else {
           try {

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/CompressableField.java Sun Mar 14 20:58:32 2010
@@ -58,8 +58,10 @@ public abstract class CompressableField 
                                       String internalVal) {
     /* compress field if length exceeds threshold */
     if(field.isCompressed()) {
-      return internalVal.length() >= compressThreshold ? 
-        Field.Store.COMPRESS : Field.Store.YES;
+      // nocommit: handle compression
+      //return internalVal.length() >= compressThreshold ? 
+      //  Field.Store.COMPRESS : Field.Store.YES;
+      return Field.Store.YES;
     } else
       return super.getFieldStore(field, internalVal);
   } 
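
[Note: Field.Store.COMPRESS was removed from Lucene, so until the nocommit above is resolved the value is stored uncompressed. The substitute Lucene offers is explicit compression at index time; a sketch reusing names from the hunk, with doc as a hypothetical Document:

    import org.apache.lucene.document.CompressionTools;
    import org.apache.lucene.document.Field;

    byte[] compressed = CompressionTools.compressString(internalVal);
    doc.add(new Field(field.getName(), compressed, Field.Store.YES));
]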

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/FieldType.java Sun Mar 14 20:58:32 2010
@@ -302,8 +302,8 @@ public abstract class FieldType extends 
   }
   protected Field.Index getFieldIndex(SchemaField field,
                                       String internalVal) {
-    return field.indexed() ? (isTokenized() ? Field.Index.TOKENIZED :
-                              Field.Index.UN_TOKENIZED) : Field.Index.NO;
+    return field.indexed() ? (isTokenized() ? Field.Index.ANALYZED :
+                              Field.Index.NOT_ANALYZED) : Field.Index.NO;
   }
 
   /**
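
[Note: this is a pure rename with identical semantics, introduced in Lucene 2.9:

    Field.Index.TOKENIZED    -> Field.Index.ANALYZED      // value is run through the Analyzer
    Field.Index.UN_TOKENIZED -> Field.Index.NOT_ANALYZED  // value indexed verbatim as one term
]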

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieDateField.java Sun Mar 14 20:58:32 2010
@@ -63,7 +63,7 @@ public class TrieDateField extends DateF
 
   @Override
   public Date toObject(Fieldable f) {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,TrieField.badFieldString(f));
     return new Date(TrieField.toLong(arr));
   }
@@ -85,7 +85,7 @@ public class TrieDateField extends DateF
 
   @Override
   public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) {
       xmlWriter.writeStr(name, TrieField.badFieldString(f));
       return;
@@ -96,7 +96,7 @@ public class TrieDateField extends DateF
 
   @Override
   public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) {
       writer.writeStr(name, TrieField.badFieldString(f),true);
       return;
@@ -136,7 +136,7 @@ public class TrieDateField extends DateF
 
   @Override
   public String toExternal(Fieldable f) {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) return TrieField.badFieldString(f);
      return super.toExternal(new Date(TrieField.toLong(arr)));
   }

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/schema/TrieField.java Sun Mar 14 20:58:32 2010
@@ -93,7 +93,7 @@ public class TrieField extends FieldType
 
   @Override
   public Object toObject(Fieldable f) {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) return badFieldString(f);
     switch (type) {
       case INTEGER:
@@ -145,7 +145,7 @@ public class TrieField extends FieldType
   }
 
   public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) {
       xmlWriter.writeStr(name, badFieldString(f));
       return;
@@ -173,7 +173,7 @@ public class TrieField extends FieldType
   }
 
   public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) {
       writer.writeStr(name, badFieldString(f),true);
       return;
@@ -352,7 +352,7 @@ public class TrieField extends FieldType
 
   @Override
   public String toExternal(Fieldable f) {
-    byte[] arr = f.binaryValue();
+    byte[] arr = f.getBinaryValue();
     if (arr==null) return badFieldString(f);
     switch (type) {
       case INTEGER:

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/search/DocSetHitCollector.java Sun Mar 14 20:58:32 2010
@@ -17,7 +17,6 @@
 
 package org.apache.solr.search;
 
-import org.apache.lucene.search.HitCollector;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.util.OpenBitSet;
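
[Note: the dropped import reflects the 2.9 search API change: Collector replaces the deprecated HitCollector, is called per segment with a docBase, and only receives scores through an explicit Scorer. A minimal bitset-filling Collector in the spirit of this class:

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Collector;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.util.OpenBitSet;

    final class BitSetCollector extends Collector {
      private final OpenBitSet bits;
      private int docBase; // offset of the current segment within the index

      BitSetCollector(int maxDoc) { this.bits = new OpenBitSet(maxDoc); }

      @Override
      public void setScorer(Scorer scorer) {} // scores not needed

      @Override
      public void collect(int doc) {
        bits.fastSet(docBase + doc); // doc is segment-relative
      }

      @Override
      public void setNextReader(IndexReader reader, int docBase) {
        this.docBase = docBase;
      }

      @Override
      public boolean acceptsDocsOutOfOrder() {
        return true; // insertion order is irrelevant for a bitset
      }
    }
]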

Modified: lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java?rev=922957&r1=922956&r2=922957&view=diff
==============================================================================
--- lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java (original)
+++ lucene/solr/branches/solr/src/java/org/apache/solr/search/PrefixFilter.java Sun Mar 14 20:58:32 2010
@@ -25,7 +25,6 @@ import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.util.OpenBitSet;
 
-import java.util.BitSet;
 import java.io.IOException;
 
 /**
@@ -40,17 +39,6 @@ public class PrefixFilter extends Filter
 
   Term getPrefix() { return prefix; }
 
-  @Override
-  public BitSet bits(IndexReader reader) throws IOException {
-    final BitSet bitSet = new BitSet(reader.maxDoc());
-    new PrefixGenerator(prefix) {
-      public void handleDoc(int doc) {
-        bitSet.set(doc);
-      }
-    }.generate(reader);
-    return bitSet;
-  }
-
  @Override
   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
     final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());