You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/25 21:32:53 UTC

svn commit: r1235919 [4/12] - in /lucene/dev/branches/lucene3661: ./ dev-tools/eclipse/ dev-tools/idea/lucene/contrib/ dev-tools/maven/ dev-tools/maven/solr/core/ dev-tools/maven/solr/solrj/ lucene/ lucene/contrib/ lucene/contrib/sandbox/src/test/org/a...

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html Wed Jan 25 20:32:44 2012
@@ -17,6 +17,42 @@
 -->
 <html><head></head>
 <body>
-Filters that normalize text before tokenization.
+<p>
+  Chainable filters that normalize text before tokenization and provide
+  mappings between normalized text offsets and the corresponding offset
+  in the original text.
+</p>
+<H2>CharFilter offset mappings</H2>
+<p>
+  CharFilters modify an input stream via a series of substring
+  replacements (including deletions and insertions) to produce an output
+  stream. There are three possible replacement cases: the replacement
+  string has the same length as the original substring; the replacement
+  is shorter; and the replacement is longer. In the latter two cases
+  (when the replacement has a different length than the original),
+  one or more offset correction mappings are required.
+</p>
+<p>
+  When the replacement is shorter than the original (e.g. when the
+  replacement is the empty string), a single offset correction mapping
+  should be added at the replacement's end offset in the output stream.
+  The <code>cumulativeDiff</code> parameter to the
+  <code>addOffCorrectMap()</code> method will be the sum of all
+  previous replacement offset adjustments, with the addition of the
+  difference between the lengths of the original substring and the
+  replacement string (a positive value).
+</p>
+<p>
+  When the replacement is longer than the original (e.g. when the
+  original is the empty string), you should add as many offset
+  correction mappings as the difference between the lengths of the
+  replacement string and the original substring, starting at the
+  end offset the original substring would have had in the output stream.
+  The <code>cumulativeDiff</code> parameter to the
+  <code>addOffCorrectMap()</code> method will be the sum of all
+  previous replacement offset adjustments, with the addition of the
+  difference between the lengths of the original substring and the
+  replacement string so far (a negative value).
+</p>
 </body>
 </html>

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Wed Jan 25 20:32:44 2012
@@ -154,13 +154,22 @@ public abstract class CompoundWordTokenF
 
     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
     public CompoundToken(int offset, int length) {
-      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
-      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
-      // chars from the term, offsets may not match correctly (other filters producing tokens
-      // may also have this problem):
-      this.startOffset = newStart;
-      this.endOffset = newStart + length;
+      
+      // offsets of the original word
+      int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
+      int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
+      
+      if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
+        // if the length implied by the start and end offsets doesn't match the
+        // term length, assume this is a synonym and don't adjust the offsets.
+        this.startOffset = startOff;
+        this.endOffset = endOff;
+      } else {
+        final int newStart = startOff + offset;
+        this.startOffset = newStart;
+        this.endOffset = newStart + length;
+      }
     }
 
   }  

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java Wed Jan 25 20:32:44 2012
@@ -60,6 +60,7 @@ public final class HyphenatedWordsFilter
   private final StringBuilder hyphenated = new StringBuilder();
   private State savedState;
   private boolean exhausted = false;
+  private int lastEndOffset = 0;
 
   /**
    * Creates a new HyphenatedWordsFilter
@@ -78,6 +79,7 @@ public final class HyphenatedWordsFilter
     while (!exhausted && input.incrementToken()) {
       char[] term = termAttribute.buffer();
       int termLength = termAttribute.length();
+      lastEndOffset = offsetAttribute.endOffset();
       
       if (termLength > 0 && term[termLength - 1] == '-') {
         // a hyphenated word
@@ -119,6 +121,7 @@ public final class HyphenatedWordsFilter
     hyphenated.setLength(0);
     savedState = null;
     exhausted = false;
+    lastEndOffset = 0;
   }
 
   // ================================================= Helper Methods ================================================
@@ -127,8 +130,6 @@ public final class HyphenatedWordsFilter
    * Writes the joined unhyphenated term
    */
   private void unhyphenate() {
-    int endOffset = offsetAttribute.endOffset();
-    
     restoreState(savedState);
     savedState = null;
     
@@ -140,7 +141,7 @@ public final class HyphenatedWordsFilter
     
     hyphenated.getChars(0, length, term, 0);
     termAttribute.setLength(length);
-    offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
+    offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
     hyphenated.setLength(0);
   }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java Wed Jan 25 20:32:44 2012
@@ -183,31 +183,33 @@ public final class PatternAnalyzer exten
    * 
    * @param fieldName
    *            the name of the field to tokenize (currently ignored).
+   * @param reader
+   *            reader (e.g. a CharFilter) of the original text; may be null.
    * @param text
    *            the string to tokenize
    * @return a new token stream
    */
-  public TokenStreamComponents createComponents(String fieldName, String text) {
+  public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
     // Ideally the Analyzer superclass should have a method with the same signature, 
     // with a default impl that simply delegates to the StringReader flavour. 
     if (text == null) 
       throw new IllegalArgumentException("text must not be null");
     
     if (pattern == NON_WORD_PATTERN) { // fast path
-      return new TokenStreamComponents(new FastStringTokenizer(text, true, toLowerCase, stopWords));
+      return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
     } else if (pattern == WHITESPACE_PATTERN) { // fast path
-      return new TokenStreamComponents(new FastStringTokenizer(text, false, toLowerCase, stopWords));
+      return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
     }
 
-    Tokenizer tokenizer = new PatternTokenizer(text, pattern, toLowerCase);
+    Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
     TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
     return new TokenStreamComponents(tokenizer, result);
   }
   
   /**
    * Creates a token stream that tokenizes all the text in the given Reader;
-   * This implementation forwards to <code>tokenStream(String, String)</code> and is
-   * less efficient than <code>tokenStream(String, String)</code>.
+   * This implementation forwards to <code>createComponents(String, Reader, String)</code> and is
+   * less efficient than <code>createComponents(String, Reader, String)</code>.
    * 
    * @param fieldName
    *            the name of the field to tokenize (currently ignored).
@@ -219,7 +221,7 @@ public final class PatternAnalyzer exten
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
     try {
       String text = toString(reader);
-      return createComponents(fieldName, text);
+      return createComponents(fieldName, reader, text);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -332,7 +334,8 @@ public final class PatternAnalyzer exten
     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
     
-    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
+    public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
+      super(input);
       this.pattern = pattern;
       this.str = str;
       this.matcher = pattern.matcher(str);
@@ -359,7 +362,7 @@ public final class PatternAnalyzer exten
           String text = str.substring(start, end);
           if (toLowerCase) text = text.toLowerCase(locale);
           termAtt.setEmpty().append(text);
-          offsetAtt.setOffset(start, end);
+          offsetAtt.setOffset(correctOffset(start), correctOffset(end));
           return true;
         }
         if (!isMatch) return false;
@@ -369,7 +372,7 @@ public final class PatternAnalyzer exten
     @Override
     public final void end() {
       // set final offset
-      final int finalOffset = str.length();
+      final int finalOffset = correctOffset(str.length());
     	this.offsetAtt.setOffset(finalOffset, finalOffset);
     }
 
@@ -406,7 +409,8 @@ public final class PatternAnalyzer exten
     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
     
-    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
+    public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
+      super(input);
       this.str = str;
       this.isLetter = isLetter;
       this.toLowerCase = toLowerCase;
@@ -458,7 +462,7 @@ public final class PatternAnalyzer exten
         return false;
       }
       termAtt.setEmpty().append(text);
-      offsetAtt.setOffset(start, i);
+      offsetAtt.setOffset(correctOffset(start), correctOffset(i));
       return true;
     }
     
@@ -466,7 +470,7 @@ public final class PatternAnalyzer exten
     public final void end() {
       // set final offset
       final int finalOffset = str.length();
-      this.offsetAtt.setOffset(finalOffset, finalOffset);
+      this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
     }    
     
     private boolean isTokenChar(char c, boolean isLetter) {
@@ -479,6 +483,7 @@ public final class PatternAnalyzer exten
 
     @Override
     public void reset(Reader input) throws IOException {
+      super.reset(input);
       this.str = PatternAnalyzer.toString(input);
     }
 

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java Wed Jan 25 20:32:44 2012
@@ -68,7 +68,7 @@ public final class TrimFilter extends To
       } else {
         termAtt.setEmpty();
       }
-      if (updateOffsets) {
+      if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset()) {
         int newStart = offsetAtt.startOffset()+start;
         int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
         offsetAtt.setOffset(newStart, newEnd);

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Wed Jan 25 20:32:44 2012
@@ -405,10 +405,20 @@ public final class WordDelimiterFilter e
     clearAttributes();
     termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
 
-    int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
-    int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
-
-    offsetAttribute.setOffset(startOffSet, endOffSet);
+    int startOffset = savedStartOffset + iterator.current;
+    int endOffset = savedStartOffset + iterator.end;
+    
+    if (hasIllegalOffsets) {
+      // historically this filter did this regardless for 'isSingleWord', 
+      // but we must do a sanity check:
+      if (isSingleWord && startOffset <= savedEndOffset) {
+        offsetAttribute.setOffset(startOffset, savedEndOffset);
+      } else {
+        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+      }
+    } else {
+      offsetAttribute.setOffset(startOffset, endOffset);
+    }
     posIncAttribute.setPositionIncrement(position(false));
     typeAttribute.setType(savedType);
   }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Wed Jan 25 20:32:44 2012
@@ -74,7 +74,8 @@ public final class EdgeNGramTokenizer ex
   private int gramSize;
   private Side side;
   private boolean started = false;
-  private int inLen;
+  private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
   private String inStr;
 
 
@@ -183,7 +184,11 @@ public final class EdgeNGramTokenizer ex
     if (!started) {
       started = true;
       char[] chars = new char[1024];
-      int charsRead = input.read(chars);
+      charsRead = input.read(chars);
+      if (charsRead < 0) {
+        charsRead = inLen = 0;
+        return false;
+      }
       inStr = new String(chars, 0, charsRead).trim();  // remove any leading or trailing spaces
       inLen = inStr.length();
       gramSize = minGram;
@@ -211,7 +216,7 @@ public final class EdgeNGramTokenizer ex
   @Override
   public final void end() {
     // set final offset
-    final int finalOffset = inLen;
+    final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }    
 
@@ -225,5 +230,6 @@ public final class EdgeNGramTokenizer ex
   public void reset() throws IOException {
     super.reset();
     started = false;
+    charsRead = 0;
   }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Wed Jan 25 20:32:44 2012
@@ -35,7 +35,8 @@ public final class NGramTokenizer extend
   private int minGram, maxGram;
   private int gramSize;
   private int pos = 0;
-  private int inLen;
+  private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
   private String inStr;
   private boolean started = false;
   
@@ -104,7 +105,11 @@ public final class NGramTokenizer extend
       started = true;
       gramSize = minGram;
       char[] chars = new char[1024];
-      input.read(chars);
+      charsRead = input.read(chars);
+      if (charsRead < 0) {
+        charsRead = inLen = 0;
+        return false;
+      }
       inStr = new String(chars).trim();  // remove any trailing empty strings 
       inLen = inStr.length();
     }
@@ -128,7 +133,7 @@ public final class NGramTokenizer extend
   @Override
   public final void end() {
     // set final offset
-    final int finalOffset = inLen;
+    final int finalOffset = correctOffset(charsRead);
     this.offsetAtt.setOffset(finalOffset, finalOffset);
   }    
   
@@ -143,5 +148,6 @@ public final class NGramTokenizer extend
     super.reset();
     started = false;
     pos = 0;
+    charsRead = 0;
   }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java Wed Jan 25 20:32:44 2012
@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokena
 public final class PositionFilter extends TokenFilter {
 
   /** Position increment to assign to all but the first token - default = 0 */
-  private int positionIncrement = 0;
+  private final int positionIncrement;
   
   /** The first token must have non-zero positionIncrement **/
   private boolean firstTokenPositioned = false;
@@ -44,7 +44,7 @@ public final class PositionFilter extend
    * @param input the input stream
    */
   public PositionFilter(final TokenStream input) {
-    super(input);
+    this(input, 0);
   }
 
   /**
@@ -56,7 +56,7 @@ public final class PositionFilter extend
    *  token from the input stream
    */
   public PositionFilter(final TokenStream input, final int positionIncrement) {
-    this(input);
+    super(input);
     this.positionIncrement = positionIncrement;
   }
 

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Wed Jan 25 20:32:44 2012
@@ -68,6 +68,7 @@ public final class ThaiWordFilter extend
   private CharTermAttribute clonedTermAtt = null;
   private OffsetAttribute clonedOffsetAtt = null;
   private boolean hasMoreTokensInClone = false;
+  private boolean hasIllegalOffsets = false; // only if the length changed before this filter
 
   /** Creates a new ThaiWordFilter with the specified match version. */
   public ThaiWordFilter(Version matchVersion, TokenStream input) {
@@ -86,7 +87,11 @@ public final class ThaiWordFilter extend
       if (end != BreakIterator.DONE) {
         clonedToken.copyTo(this);
         termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
-        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        if (hasIllegalOffsets) {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+        } else {
+          offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
+        }
         if (handlePosIncr) posAtt.setPositionIncrement(1);
         return true;
       }
@@ -102,6 +107,10 @@ public final class ThaiWordFilter extend
     }
     
     hasMoreTokensInClone = true;
+    
+    // if the length implied by the start and end offsets doesn't match the
+    // term length, assume this is a synonym and don't adjust the offsets.
+    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
 
     // we lazy init the cloned token, as in ctor not all attributes may be added
     if (clonedToken == null) {
@@ -118,7 +127,11 @@ public final class ThaiWordFilter extend
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
       termAtt.setLength(end);
-      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      if (hasIllegalOffsets) {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
+      } else {
+        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
+      }
       // position increment keeps as it is for first token
       return true;
     }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java Wed Jan 25 20:32:44 2012
@@ -306,13 +306,14 @@ public final class WikipediaTokenizer ex
   @Override
   public void reset() throws IOException {
     super.reset();
-    scanner.yyreset(input);
+    tokens = null;
+    scanner.reset();
   }
 
   @Override
   public void reset(Reader reader) throws IOException {
     super.reset(reader);
-    reset();
+    scanner.yyreset(input);
   }
 
   @Override

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Wed Jan 25 20:32:44 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:11 PM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
 
 package org.apache.lucene.analysis.wikipedia;
 
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokena
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 9/30/11 12:11 PM from the specification file
- * <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 1/22/12 10:26 PM from the specification file
+ * <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
  */
 class WikipediaTokenizerImpl {
 
@@ -498,6 +498,14 @@ final int setText(StringBuilder buffer){
   return length;
 }
 
+final void reset() {
+  currentTokType = 0;
+  numBalanced = 0;
+  positionInc = 1;
+  numLinkToks = 0;
+  numWikiTokensSeen = 0;
+}
+
 
 
 

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex Wed Jan 25 20:32:44 2012
@@ -91,6 +91,14 @@ final int setText(StringBuilder buffer){
   return length;
 }
 
+final void reset() {
+  currentTokType = 0;
+  numBalanced = 0;
+  positionInc = 1;
+  numLinkToks = 0;
+  numWikiTokensSeen = 0;
+}
+
 
 %}
 

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Wed Jan 25 20:32:44 2012
@@ -23,6 +23,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTo
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.junit.Ignore;
+import org.apache.lucene.util._TestUtil;
 
 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
@@ -41,9 +42,9 @@ public class HTMLStripCharFilterTest ext
     String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
             "another <a href=\"http://lucene.apache.org/\">link</a>. " +
             "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
-    String gold = " this is some text  here is a  link  and " +
-            "another  link . " +
-            "This is an entity: & plus a <.  Here is an &.  ";
+    String gold = "\nthis is some text\n here is a link and " +
+            "another link. " +
+            "This is an entity: & plus a <.  Here is an &. ";
     HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
@@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest ext
               + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
       position++;
     }
-    assertEquals(gold, builder.toString());
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
   //Some sanity checks, but not a full-fledged check
@@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest ext
     
   }
 
+  public void testMSWord14GeneratedHTML() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    String gold = "This is a test";
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString().trim());
+  }
+  
+  
   public void testGamma() throws Exception {
     String test = "&Gamma;";
     String gold = "\u0393";
@@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testEntities() throws Exception {
@@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testMoreEntities() throws Exception {
@@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest ext
       builder.append((char)ch);
     }
     String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
   }
 
   public void testReserved() throws Exception {
@@ -147,45 +161,248 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testMalformedHTML() throws Exception {
-    String test = "a <a hr<ef=aa<a>> </close</a>";
-    String gold = "a <a hr<ef=aa > </close ";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
-    StringBuilder builder = new StringBuilder();
-    int ch = 0;
-    while ((ch = reader.read()) != -1){
-      builder.append((char)ch);
+    String[] testGold = {
+        "a <a hr<ef=aa<a>> </close</a>",
+        "a <a hr<ef=aa> </close",
+
+        "<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
+        "Submit a Site",
+
+        "<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
+        "Christian Science",
+
+        "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
+        "\n",
+
+        // "<" before ">" inhibits tag recognition
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+
+        "<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
+        "",
+
+        "<link title=\"^\\\" 21Sta's Blog\" rel=\"search\"  type=\"application/opensearchdescription+xml\"  href=\"http://21sta.com/blog/inc/opensearch.php\" />",
+        "\n",
+
+        "<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
+        "?",
+
+        "<a href='/modern-furniture'   ' id='21txt' class='offtab'   onMouseout=\"this.className='offtab';  return true;\" onMouseover=\"this.className='ontab';  return true;\">",
+        "",
+
+        "<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
+        "",
+
+        "The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
+        "The <a href=medical\">http://www.advancedmd.com>medical practice software",
+
+        "<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
+        "Levi.com/BMX 2008 Clip of the Week 29...",
+
+        "<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
+        "Printer Friendly",
+
+        "<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
+        "Add to Favorites",
+
+        "<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
+        "At",
+
+        "E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
+        "E-mail: XXXXXX@example.com ",
+
+        "<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
+        "\nA'13?\n",
+
+        "<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
+        "\nHubert \"Geese\" Ausby\n",
+
+        "<href=\"http://anbportal.com/mms/login.asp\">",
+        "\n",
+
+        "<a href=\"",
+        "<a href=\"",
+
+        "<a href=\">",
+        "",
+
+        "<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
+        "#",
+
+        "<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
+        "",
+
+        "<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want  add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
+        "",
+
+        "<a href=#Services & Support>",
+        "",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' +  document.getElementById('advancedlink').style.display ;  document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
+        "",
+
+        "<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\"  hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
+        "",
+
+        "<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
+        "\n",
+
+        "<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
+        "#",
+
+        "<a href=  >",
+        "",
+
+        "<ahref=http:..",
+        "<ahref=http:..",
+
+        "<ahref=http:..>",
+        "\n",
+
+        "<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
+        "\nA",
+
+        "<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
+        "",
+
+        "<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
+        "",
+
+        "<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
+        "",
+
+        "<a class=\"at\" name=\"Lamborghini  href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
+        "Lamborghini /a>",
+
+        "<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
+        "",
+
+        "<a href=/myspace !style='color:#993333'>",
+        "",
+
+        "<meta name=3DProgId content=3DExcel.Sheet>",
+        "\n",
+
+        "<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
+        "\n",
+
+        "<td bgcolor=3D\"#FFFFFF\" nowrap>",
+        "\n",
+
+        "<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
+        "\"predicciones mundiales 2009\"",
+
+        "<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
+        "",
+
+        "<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
+        "Bishop\"",
+
+        "<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
+        "BHAA Eircom 2 & 5 miles CC combined start",
+
+        "<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
+        "",
+
+        "<a  href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
+        "",
+
+        // "<" before ">" inhibits tag recognition
+        "<input type=\"text\" value=\"<search here>\">",
+        "<input type=\"text\" value=\"\n\">",
+
+        "<input type=\"text\" value=\"<search here\">",
+        "<input type=\"text\" value=\"\n",
+
+        "<input type=\"text\" value=\"search here>\">",
+        "\">",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
+        "",
+
+        "<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
+        "\n\n\n",
+
+        "<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
+        "\n\n\n\n\n\n\n\n",
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      String test = testGold[i];
+      String gold = testGold[i + 1];
+      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      StringBuilder builder = new StringBuilder();
+      int ch = 0;
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+      String result = builder.toString();
+      assertEquals("Test: '" + test + "'", gold, result);
     }
-    String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
   }
 
+
   public void testBufferOverflow() throws Exception {
-    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
     testBuilder.append("ah<?> ??????");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
 
     testBuilder.setLength(0);
     testBuilder.append("<!--");//comments
-    appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+    appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
 
     testBuilder.append("-->foo");
-    processBuffer(testBuilder.toString(), "Failed w/ comment");
+    String gold = "foo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
 
     testBuilder.setLength(0);
     testBuilder.append("<?");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("?>");
-    processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
     
     testBuilder.setLength(0);
     testBuilder.append("<b ");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("/>");
-    processBuffer(testBuilder.toString(), "Failed on tag");
-
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
   private void appendChars(StringBuilder testBuilder, int numChars) {
@@ -208,13 +425,14 @@ public class HTMLStripCharFilterTest ext
     } finally {
       // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
     }
-    assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+    assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
+        test, builder.toString());
   }
 
   public void testComment() throws Exception {
 
     String test = "<!--- three dashes, still a valid comment ---> ";
-    String gold = "  ";
+    String gold = " ";
     Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
     int ch = 0;
     StringBuilder builder = new StringBuilder();
@@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest ext
     } finally {
       // System.out.println("String: " + builder.toString());
     }
-    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
   }
 
 
@@ -247,15 +466,32 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testOffsets() throws Exception {
-    doTestOffsets("hello X how X are you");
+//    doTestOffsets("hello X how X are you");
     doTestOffsets("hello <p> X<p> how <p>X are you");
     doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
 
     // test backtracking
     doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
   }
-  
-  @Ignore("broken offsets: see LUCENE-2208")
+
+  static void assertLegalOffsets(String in) throws Exception {
+    int length = in.length();
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    int ch = 0;
+    int off = 0;
+    while ((ch = reader.read()) != -1) {
+      int correction = reader.correctOffset(off);
+      assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
+          correction <= length);
+      off++;
+    }
+  }
+
+  public void testLegalOffsets() throws Exception {
+    assertLegalOffsets("hello world");
+    assertLegalOffsets("hello &#x world");
+  }
+
   public void testRandom() throws Exception {
     Analyzer analyzer = new Analyzer() {
 
@@ -267,11 +503,361 @@ public class HTMLStripCharFilterTest ext
 
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new HTMLStripCharFilter(CharReader.get(reader));
       }
     };
     
     int numRounds = RANDOM_MULTIPLIER * 10000;
     checkRandomData(random, analyzer, numRounds);
   }
+  
+  public void testServerSideIncludes() throws Exception {
+    String test = "one<img src=\"image.png\"\n"
+        + " alt =  \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}'  -->\"\n\n"
+        + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
+    String gold = "onetwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
+
+    test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
+    gold = "one\ntwo";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testScriptQuotes() throws Exception {
+    String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
+    String gold = "one\ntwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+
+    test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
+    gold = "hello\n";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testEscapeScript() throws Exception {
+    String test = "one<script no-value-attr>callSomeMethod();</script>two";
+    String gold = "one<script no-value-attr></script>two";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testStyle() throws Exception {
+    String test = "one<style type=\"text/css\">\n"
+                + "<!--\n"
+                + "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+                + "-->\n"
+                + "</style>two";
+    String gold = "one\ntwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testEscapeStyle() throws Exception {
+    String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
+    String gold = "one<style type=\"text/css\"></style>two";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testBR() throws Exception {
+    String[] testGold = {
+        "one<BR />two<br>three",
+        "one\ntwo\nthree",
+
+        "one<BR some stuff here too>two</BR>",
+        "one\ntwo\n",
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      String test = testGold[i];
+      String gold = testGold[i + 1];
+      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+      StringBuilder builder = new StringBuilder();
+      int ch = 0;
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+      String result = builder.toString();
+      assertEquals("Test: '" + test + "'", gold, result);
+    }
+  }
+  public void testEscapeBR() throws Exception {
+    String test = "one<BR class='whatever'>two</\nBR\n>";
+    String gold = "one<BR class='whatever'>two</\nBR\n>";
+    Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(test)), escapedTags);
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+  
+  public void testInlineTagsNoSpace() throws Exception {
+    String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
+    String gold = "onetwo2e.three";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testCDATA() throws Exception {
+    String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
+    String gold = "one<one><two>three<four></four></two></one>two";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+
+    test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
+    gold = "onetwo<![CDATA[three]]>fourfive";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testUppercaseCharacterEntityVariants() throws Exception {
+    String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
+    String gold = " \"-\u00A9>><<\u00AE&";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+  
+  public void testMSWordMalformedProcessingInstruction() throws Exception {
+    String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
+    String gold = "onetwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  public void testSupplementaryCharsInTags() throws Exception {
+    String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
+    String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+        gold, builder.toString());
+  }
+
+  public void testRandomBrokenHTML() throws Exception {
+    int maxNumElements = 10000;
+    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(text)));
+    while (reader.read() != -1);
+  }
+
+  public void testRandomText() throws Exception {
+    StringBuilder text = new StringBuilder();
+    int minNumWords = 10;
+    int maxNumWords = 10000;
+    int minWordLength = 3;
+    int maxWordLength = 20;
+    int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    switch (_TestUtil.nextInt(random, 0, 4)) {
+      case 0: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      case 1: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomRealisticUnicodeString
+              (random, minWordLength, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      default: { // ASCII for the remaining values 2..4 (60% of the time, assuming nextInt's bounds are inclusive)
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomSimpleString(random));
+          text.append(' ');
+        }
+      }
+    }
+    Reader reader = new HTMLStripCharFilter
+        (CharReader.get(new StringReader(text.toString())));
+    while (reader.read() != -1);
+  }
+
+  public void testUTF16Surrogates() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+      }
+    };
+    // Paired surrogates
+    assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
+        new String[] { "one", "two", "\uD86C\uDC01three" } );
+    assertAnalyzesTo(analyzer, " &#55404;&#XdC01;", new String[] { "\uD86C\uDC01" } );
+    assertAnalyzesTo(analyzer, " &#xD86C;&#56321;", new String[] { "\uD86C\uDC01" } );
+    assertAnalyzesTo(analyzer, " &#55404;&#56321;", new String[] { "\uD86C\uDC01" } );
+
+    // Improperly paired surrogates
+    assertAnalyzesTo(analyzer, " &#55404;&#57999;", new String[] { "\uFFFD\uE28F" } );
+    assertAnalyzesTo(analyzer, " &#xD86C;&#57999;", new String[] { "\uFFFD\uE28F" } );
+    assertAnalyzesTo(analyzer, " &#55002;&#XdC01;", new String[] { "\uD6DA\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55002;&#56321;", new String[] { "\uD6DA\uFFFD" } );
+
+    // Unpaired high surrogates
+    assertAnalyzesTo(analyzer, " &#Xd921;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#Xd921", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#Xd921<br>", new String[] { "&#Xd921" } );
+    assertAnalyzesTo(analyzer, " &#55528;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55528", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#55528<br>", new String[] { "&#55528" } );
+
+    // Unpaired low surrogates
+    assertAnalyzesTo(analyzer, " &#xdfdb;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#xdfdb", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#xdfdb<br>", new String[] { "&#xdfdb" } );
+    assertAnalyzesTo(analyzer, " &#57209;", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
+    assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java Wed Jan 25 20:32:44 2012
@@ -117,5 +117,10 @@ public class TestChineseTokenizer extend
       assertAnalyzesTo(justFilter, "This is a Test. b c d", 
           new String[] { "This", "Test." });
     }
+    
+    /** Feeds random data through a fresh ChineseAnalyzer via checkRandomData; the count argument scales with RANDOM_MULTIPLIER. */
+    public void testRandomStrings() throws Exception {
+      checkRandomData(random, new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
+    }
 
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/commongrams/CommonGramsFilterTest.java Wed Jan 25 20:32:44 2012
@@ -306,4 +306,31 @@ public class CommonGramsFilterTest exten
     TokenFilter nsf = new CommonGramsQueryFilter(cgf);
     assertTokenStreamContents(nsf, new String[] { "the_of" });
   }
+  
+  /** Feeds random data through two analyzers: one ending in CommonGramsFilter, one wrapping that in CommonGramsQueryFilter. */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() { // chain: MockTokenizer -> CommonGramsFilter
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
+        return new TokenStreamComponents(t, cgf);
+      }
+    };
+    
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() { // chain: MockTokenizer -> CommonGramsFilter -> CommonGramsQueryFilter
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
+        return new TokenStreamComponents(t, new CommonGramsQueryFilter(cgf));
+      }
+    };
+    
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Wed Jan 25 20:32:44 2012
@@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compo
  */
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
@@ -299,5 +304,61 @@ public class TestCompoundWordTokenFilter
       }
     }
   }
+  
+  // SOLR-2891
+  // *CompoundWordTokenFilter blindly adds the term length to the offset, which can point outside the
+  // original text when an earlier stage lengthens the word (here MappingCharFilter maps ü -> ue),
+  // so in that case we behave like WDF and preserve the offsets as already modified upstream
+  public void testInvalidOffsets() throws Exception {
+    final CharArraySet dict = makeDictionary("fall");
+    final NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("ü", "ue");
+    
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader)); // wraps the incoming reader with the ü -> ue mapping
+      }
+    };
 
+    assertAnalyzesTo(analyzer, "banküberfall", 
+        new String[] { "bankueberfall", "fall" },
+        new int[] { 0,  0 }, // presumably expected start offsets - TODO confirm against assertAnalyzesTo
+        new int[] { 12, 12 }); // presumably expected end offsets; 12 is the length of the unmapped input string
+  }
+  
+  /** Feeds random data through DictionaryCompoundWordTokenFilter, then through HyphenationCompoundWordTokenFilter. */
+  public void testRandomStrings() throws Exception {
+    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
+    Analyzer a = new Analyzer() { // chain: MockTokenizer -> DictionaryCompoundWordTokenFilter(dict)
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
+      }
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); // resource looked up via this test class
+    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+    Analyzer b = new Analyzer() { // chain: MockTokenizer -> HyphenationCompoundWordTokenFilter(hyphenator)
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java Wed Jan 25 20:32:44 2012
@@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hunsp
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;
 import java.io.StringReader;
 import java.text.ParseException;
 import java.util.Arrays;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.junit.BeforeClass;
@@ -57,4 +60,17 @@ public class HunspellStemFilterTest  ext
     filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
     assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
   }
+  
+  /** Feeds random data through a MockTokenizer followed by HunspellStemFilter using the shared DICTIONARY. */
+  public void testRandomStrings() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
+      }  
+    };
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER); // count argument scales with RANDOM_MULTIPLIER
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/PatternAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/PatternAnalyzerTest.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/PatternAnalyzerTest.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/PatternAnalyzerTest.java Wed Jan 25 20:32:44 2012
@@ -22,6 +22,7 @@ import java.io.StringReader;
 import java.util.Arrays;
 import java.util.regex.Pattern;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.StopAnalyzer;
@@ -132,4 +133,10 @@ public class PatternAnalyzerTest extends
     TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
     assertTokenStreamContents(ts2, expected);
   }
+  
+  /** Feeds random data through a PatternAnalyzer built with the "," pattern and the English stop-word set. */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); // count argument scales with RANDOM_MULTIPLIER
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java Wed Jan 25 20:32:44 2012
@@ -17,11 +17,14 @@ package org.apache.lucene.analysis.misce
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.List;
 import java.util.ArrayList;
@@ -1907,4 +1910,17 @@ public class TestASCIIFoldingFilter exte
     assertTrue(stream.incrementToken());
     assertEquals(expected, termAtt.toString());
   }
+  
+  /** Feeds random data through a MockTokenizer followed by ASCIIFoldingFilter. */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
+      } 
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); // count argument scales with RANDOM_MULTIPLIER
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilter.java Wed Jan 25 20:32:44 2012
@@ -18,12 +18,14 @@
 package org.apache.lucene.analysis.miscellaneous;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
@@ -117,4 +119,18 @@ public class TestCapitalizationFilter ex
         new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
         minWordLength, maxWordCount, maxTokenLength);    
   }
+  
+  /** Feeds random data through a MockTokenizer followed by a CapitalizationFilter built with its one-argument constructor. */
+  public void testRandomString() throws Exception {
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
+      }
+    };
+    
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); // count argument scales with RANDOM_MULTIPLIER
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestHyphenatedWordsFilter.java Wed Jan 25 20:32:44 2012
@@ -17,11 +17,14 @@
 
 package org.apache.lucene.analysis.miscellaneous;
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 
 /**
  * HyphenatedWordsFilter test
@@ -46,5 +49,29 @@ public class TestHyphenatedWordsFilter e
 	    ts = new HyphenatedWordsFilter(ts);
 	    assertTokenStreamContents(ts, 
 	        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
-	  }
+	}
+	
+	public void testOffsets() throws Exception { // checks the offsets reported for tokens joined by HyphenatedWordsFilter
+	  String input = "abc- def geh 1234- 5678-"; // "abc-" and "1234-" are followed by another token; the final "5678-" is not
+    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
+    ts = new HyphenatedWordsFilter(ts);
+    assertTokenStreamContents(ts, 
+        new String[] { "abcdef", "geh", "12345678-" },
+        new int[] { 0, 9, 13 }, // presumably expected start offsets - TODO confirm against assertTokenStreamContents
+        new int[] { 8, 12, 24 }); // presumably expected end offsets; 24 is the length of the input string
+	}
+	
+  /** Feeds random data through a MockTokenizer followed by HyphenatedWordsFilter. */
+  public void testRandomString() throws Exception {
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
+      }
+    };
+    
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER); // count argument scales with RANDOM_MULTIPLIER
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java Wed Jan 25 20:32:44 2012
@@ -17,13 +17,16 @@
 
 package org.apache.lucene.analysis.miscellaneous;
 
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 /** Test {@link KeepWordFilter} */
@@ -57,4 +60,23 @@ public class TestKeepWordFilter extends 
     stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
     assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
   }
+  
+  /** Feeds random data through a MockTokenizer followed by a KeepWordFilter whose word set is {"a", "b"}. */
+  public void testRandomStrings() throws Exception {
+    final Set<String> words = new HashSet<String>();
+    words.add( "a" );
+    words.add( "b" );
+    
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true)); // a new CharArraySet is built from words on every call
+        return new TokenStreamComponents(tokenizer, stream);
+      }
+    };
+    
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+  }
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java Wed Jan 25 20:32:44 2012
@@ -17,13 +17,21 @@
 
 package org.apache.lucene.analysis.miscellaneous;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util._TestUtil;
 
+import java.io.Reader;
 import java.util.Iterator;
 import java.util.Arrays;
 
@@ -116,6 +124,45 @@ public class TestRemoveDuplicatesTokenFi
              
   }
   
+  // helper methods for the synonym-based testRandomStrings below
+  private String randomNonEmptyString() {
+    while(true) { // keep drawing until a usable string comes back
+      final String s = _TestUtil.randomUnicodeString(random).trim();
+      if (s.length() != 0 && s.indexOf('\u0000') == -1) { // must be non-empty after trim and contain no U+0000 character
+        return s;
+      }
+    }
+  }
+  
+  private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { // registers one input -> output rule on b
+    b.add(new CharsRef(input.replaceAll(" +", "\u0000")), // each run of spaces becomes a single U+0000
+          new CharsRef(output.replaceAll(" +", "\u0000")), // same substitution applied to the output side
+          keepOrig);
+  }
   
+  /** Feeds random data through SynonymFilter + RemoveDuplicatesTokenFilter, building a new random SynonymMap on each iteration. */
+  public void testRandomStrings() throws Exception {
+    final int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
+      final int numEntries = atLeast(10);
+      for (int j = 0; j < numEntries; j++) {
+        add(b, randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean()); // random input, random output, random keepOrig
+      }
+      final SynonymMap map = b.build();
+      final boolean ignoreCase = random.nextBoolean();
+      
+      final Analyzer analyzer = new Analyzer() { // chain: MockTokenizer -> SynonymFilter -> RemoveDuplicatesTokenFilter
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+          TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
+          return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
+        }
+      };
+
+      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER); // 1000 rather than 10000 here; this call sits inside the numIters loop
+    }
+  }
 
 }

Modified: lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java?rev=1235919&r1=1235918&r2=1235919&view=diff
==============================================================================
--- lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (original)
+++ lucene/dev/branches/lucene3661/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java Wed Jan 25 20:32:44 2012
@@ -18,11 +18,15 @@
 package org.apache.lucene.analysis.miscellaneous;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.Collection;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
 
 /**
@@ -103,4 +107,27 @@ public class TestTrimFilter extends Base
       }
     }
   }
+  
+  /** Feeds random data through TrimFilter twice: once constructed with false as its second argument, once with true. */
+  public void testRandomStrings() throws Exception {
+    Analyzer a = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false)); // second argument false here; analyzer b passes true
+      } 
+    };
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    
+    Analyzer b = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+        return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true)); // identical chain except for the true flag
+      } 
+    };
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+  }
 }