Posted to commits@lucene.apache.org by mi...@apache.org on 2012/04/10 18:54:55 UTC

svn commit: r1311864 - in /lucene/dev/branches/lucene3969: lucene/test-framework/src/java/org/apache/lucene/analysis/ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/ modules/analysis/common/src/test/org/apache/lucene/analysis/co...

Author: mikemccand
Date: Tue Apr 10 16:54:54 2012
New Revision: 1311864

URL: http://svn.apache.org/viewvc?rev=1311864&view=rev
Log:
LUCENE-3969: make full offset checking optional and disable for the known (buggy) offenders

Modified:
    lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java

Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Tue Apr 10 16:54:54 2012
@@ -100,7 +100,14 @@ public abstract class BaseTokenStreamTes
     }
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+  // offsetsAreCorrect also validates:
+  //   - graph offsets are correct (all tokens leaving from
+  //     pos X have the same startOffset; all tokens
+  //     arriving to pos Y have the same endOffset)
+  //   - offsets only move forwards (startOffset >=
+  //     lastStartOffset)
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+                                               boolean offsetsAreCorrect) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
     
@@ -137,6 +144,7 @@ public abstract class BaseTokenStreamTes
 
     ts.reset();
     int pos = -1;
+    int lastStartOffset = 0;
     for (int i = 0; i < output.length; i++) {
      // extra safety to enforce that the state is not preserved, and also to assign bogus values
       ts.clearAttributes();
@@ -176,7 +184,12 @@ public abstract class BaseTokenStreamTes
                      endOffset <= finalOffset.intValue());
         }
 
-        if (posLengthAtt != null && posIncrAtt != null) {
+        if (offsetsAreCorrect) {
+          assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
+          lastStartOffset = offsetAtt.startOffset();
+        }
+
+        if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
           // Validate offset consistency in the graph, ie
           // all tokens leaving from a certain pos have the
          // same startOffset, and all tokens arriving at a
@@ -233,6 +246,10 @@ public abstract class BaseTokenStreamTes
     ts.close();
   }
   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
     assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
   }
@@ -280,6 +297,10 @@ public abstract class BaseTokenStreamTes
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
     assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
+
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
+  }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
     assertAnalyzesTo(a, input, output, null, null, null, null, null);
@@ -342,12 +363,12 @@ public abstract class BaseTokenStreamTes
   
   /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
   public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
-    checkRandomData(random, a, iterations, 20, false);
+    checkRandomData(random, a, iterations, 20, false, true);
   }
-  
+
   /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
-    checkRandomData(random, a, iterations, maxWordLength, false);
+    checkRandomData(random, a, iterations, maxWordLength, false, true);
   }
   
   /** 
@@ -355,7 +376,7 @@ public abstract class BaseTokenStreamTes
    * @param simple true if only ascii strings will be used (try to avoid)
    */
   public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
-    checkRandomData(random, a, iterations, 20, simple);
+    checkRandomData(random, a, iterations, 20, simple, true);
   }
   
   static class AnalysisThread extends Thread {
@@ -364,13 +385,15 @@ public abstract class BaseTokenStreamTes
     final Random random;
     final Analyzer a;
     final boolean simple;
+    final boolean offsetsAreCorrect;
     
-    AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) {
+    AnalysisThread(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) {
       this.random = random;
       this.a = a;
       this.iterations = iterations;
       this.maxWordLength = maxWordLength;
       this.simple = simple;
+      this.offsetsAreCorrect = offsetsAreCorrect;
     }
     
     @Override
@@ -378,7 +401,7 @@ public abstract class BaseTokenStreamTes
       try {
         // see the part in checkRandomData where it replays the same text again
        // to verify reproducibility/reuse: hopefully this would catch thread hazards.
-        checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
+        checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect);
       } catch (IOException e) {
         Rethrow.rethrow(e);
       }
@@ -386,12 +409,16 @@ public abstract class BaseTokenStreamTes
   };
   
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
-    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
+    checkRandomData(random, a, iterations, maxWordLength, simple, true);
+  }
+
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
+    checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple, offsetsAreCorrect);
     // now test with multiple threads
     int numThreads = _TestUtil.nextInt(random, 4, 8);
     Thread threads[] = new Thread[numThreads];
     for (int i = 0; i < threads.length; i++) {
-      threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple);
+      threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, maxWordLength, simple, offsetsAreCorrect);
     }
     for (int i = 0; i < threads.length; i++) {
       threads[i].start();
@@ -405,7 +432,7 @@ public abstract class BaseTokenStreamTes
     }
   }
 
-  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
 
     final LineFileDocs docs = new LineFileDocs(random);
 
@@ -437,7 +464,7 @@ public abstract class BaseTokenStreamTes
       }
 
       try {
-        checkAnalysisConsistency(random, a, useCharFilter, text);
+        checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
       } catch (Throwable t) {
         // TODO: really we should pass a random seed to
         // checkAnalysisConsistency then print it here too:
@@ -477,6 +504,10 @@ public abstract class BaseTokenStreamTes
   }
 
   public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
+    checkAnalysisConsistency(random, a, useCharFilter, text, true);
+  }
+
+  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
 
     if (VERBOSE) {
       System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -616,7 +647,8 @@ public abstract class BaseTokenStreamTes
                                 types.toArray(new String[types.size()]),
                                 toIntArray(positions),
                                 toIntArray(positionLengths),
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
     } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
       // offset + pos + type
       assertTokenStreamContents(ts, 
@@ -626,7 +658,8 @@ public abstract class BaseTokenStreamTes
                                 types.toArray(new String[types.size()]),
                                 toIntArray(positions),
                                 null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
     } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
       // offset + pos + posLength
       assertTokenStreamContents(ts, 
@@ -636,7 +669,8 @@ public abstract class BaseTokenStreamTes
                                 null,
                                 toIntArray(positions),
                                 toIntArray(positionLengths),
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
     } else if (posIncAtt != null && offsetAtt != null) {
       // offset + pos
       assertTokenStreamContents(ts, 
@@ -646,7 +680,8 @@ public abstract class BaseTokenStreamTes
                                 null,
                                 toIntArray(positions),
                                 null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
     } else if (offsetAtt != null) {
       // offset
       assertTokenStreamContents(ts, 
@@ -656,7 +691,8 @@ public abstract class BaseTokenStreamTes
                                 null,
                                 null,
                                 null,
-                                text.length());
+                                text.length(),
+                                offsetsAreCorrect);
     } else {
       // terms only
       assertTokenStreamContents(ts, 

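For illustration only (not part of this commit): with the new trailing
boolean, a test for a filter whose offsets are known to be wrong can keep
its other assertions while opting out of the strict offset validation. A
minimal sketch, where BuggyOffsetsFilter and all expected values are
hypothetical:

    public void testKnownBuggyOffsets() throws Exception {
      Tokenizer tok = new MockTokenizer(new StringReader("ab cd"), MockTokenizer.WHITESPACE, false);
      TokenStream ts = new BuggyOffsetsFilter(tok);   // hypothetical filter that emits tokens out of offset order
      assertTokenStreamContents(ts,
                                new String[] { "cd", "ab" },  // output
                                new int[] { 3, 0 },           // startOffsets: going backwards would trip the new check
                                new int[] { 5, 2 },           // endOffsets
                                null,                         // types
                                new int[] { 1, 1 },           // posIncrements
                                null,                         // posLengths
                                5,                            // finalOffset
                                false);                       // offsetsAreCorrect: skip the new validation
    }
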
Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java Tue Apr 10 16:54:54 2012
@@ -27,7 +27,11 @@ import org.apache.lucene.analysis.tokena
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util.Attribute;
 
-// nocommit better name...?
+// nocommit rename to OffsetsXXXTF?  ie we only validate
+// offsets (now anyway...)
+
+// TODO: also make a DebuggingTokenFilter, that just prints
+// all att values that come through it...
 
 // nocommit BTSTC should just append this to the chain
 // instead of checking itself:
@@ -37,6 +41,7 @@ import org.apache.lucene.util.Attribute;
 public final class ValidatingTokenFilter extends TokenFilter {
 
   private int pos;
+  private int lastStartOffset;
 
   // Maps position to the start/end offset:
   private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
@@ -46,6 +51,7 @@ public final class ValidatingTokenFilter
   private final PositionLengthAttribute posLenAtt = getAttrIfExists(PositionLengthAttribute.class);
   private final OffsetAttribute offsetAtt = getAttrIfExists(OffsetAttribute.class);
   private final CharTermAttribute termAtt = getAttrIfExists(CharTermAttribute.class);
+  private final boolean offsetsAreCorrect;
 
   private final String name;
 
@@ -61,9 +67,10 @@ public final class ValidatingTokenFilter
   /** The name arg is used to identify this stage when
    *  throwing exceptions (useful if you have more than one
    *  instance in your chain). */
-  public ValidatingTokenFilter(TokenStream in, String name) {
+  public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
     super(in);
     this.name = name;
+    this.offsetsAreCorrect = offsetsAreCorrect;
   }
 
   @Override
@@ -82,6 +89,8 @@ public final class ValidatingTokenFilter
         throw new IllegalStateException("first posInc must be > 0");
       }
     }
+
+    // System.out.println("  got token=" + termAtt + " pos=" + pos);
     
     if (offsetAtt != null) {
       startOffset = offsetAtt.startOffset();
@@ -96,11 +105,15 @@ public final class ValidatingTokenFilter
       if (endOffset < startOffset) {
         throw new IllegalStateException(name + ": startOffset=" + startOffset + " is > endOffset=" + endOffset + " pos=" + pos + "; token=" + termAtt);
       }
+      if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+        throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
+      }
+      lastStartOffset = offsetAtt.startOffset();
     }
     
     posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
     
-    if (offsetAtt != null && posIncAtt != null) {
+    if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
 
       if (!posToStartOffset.containsKey(pos)) {
         // First time we've seen a token leaving from this position:
@@ -152,5 +165,6 @@ public final class ValidatingTokenFilter
     pos = -1;
     posToStartOffset.clear();
     posToEndOffset.clear();
+    lastStartOffset = 0;
   }
 }

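For illustration only (not part of this commit): the new constructor
argument lets each ValidatingTokenFilter stage in a chain be told up front
whether strict offset checks apply. A sketch in the spirit of what
TestRandomChains.newFilterChain now does, using TrimFilter (one of the
flagged offenders); the surrounding setup is hypothetical:

    Tokenizer tok = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream ts = new ValidatingTokenFilter(tok, "stage 0", true);   // tokenizer offsets are trusted
    ts = new TrimFilter(ts, true);                                      // TrimFilter is a known offsets offender
    ts = new ValidatingTokenFilter(ts, "last stage", false);            // so downstream stages relax the offset checks
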
Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java Tue Apr 10 16:54:54 2012
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.MockTo
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
@@ -195,6 +196,7 @@ public class TestMappingCharFilter exten
   }
   
   // nocommit: wrong final offset, fix this!
+  @Ignore
   public void testFinalOffsetSpecialCase() throws Exception {  
     final NormalizeCharMap map = new NormalizeCharMap();
     map.add("t", "");
@@ -219,6 +221,7 @@ public class TestMappingCharFilter exten
   }
   
   // nocommit: this is intended to fail until we fix bugs
+  @Ignore
   public void testRandomMaps() throws Exception {
     for (int i = 0; i < 100; i++) {
       final NormalizeCharMap map = randomMap();

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Tue Apr 10 16:54:54 2012
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTo
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
 import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
@@ -63,6 +64,8 @@ import org.apache.lucene.analysis.compou
 import org.apache.lucene.analysis.hunspell.HunspellDictionary;
 import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
@@ -91,42 +94,54 @@ import org.xml.sax.InputSource;
 
 /** tests random analysis chains */
 public class TestRandomChains extends BaseTokenStreamTestCase {
+
   static List<Constructor<? extends Tokenizer>> tokenizers;
   static List<Constructor<? extends TokenFilter>> tokenfilters;
   static List<Constructor<? extends CharStream>> charfilters;
-  
+
   // TODO: fix those and remove
   private static final Set<Class<?>> brokenComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
   static {
+    // nocommit can we promote some of these to be only
+    // offsets offenders?
     Collections.<Class<?>>addAll(brokenComponents,
-      // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
-      EmptyTokenizer.class,
-      // doesn't actually reset itself!
-      CachingTokenFilter.class,
-      // nocommit: corrupts graphs (offset consistency check)
-      PositionFilter.class,
-      // doesn't consume whole stream!
-      LimitTokenCountFilter.class,
-      // broken!
-      NGramTokenizer.class,
-      // broken!
-      NGramTokenFilter.class,
-      // broken!
-      EdgeNGramTokenizer.class,
-      // broken!
-      EdgeNGramTokenFilter.class,
-      // fix these 4 to use 'real positions' and not stack the way they do:
-      // if you want that use positionfilter
-      PathHierarchyTokenizer.class,
-      ReversePathHierarchyTokenizer.class,
-      HyphenationCompoundWordTokenFilter.class,
-      DictionaryCompoundWordTokenFilter.class,
-      // Not broken: we forcefully add this, so we shouldn't
-      // also randomly pick it:
-      ValidatingTokenFilter.class
+                                 // TODO: fix basetokenstreamtestcase not to trip because this one has no CharTermAtt
+                                 EmptyTokenizer.class,
+                                 // doesn't actually reset itself!
+                                 CachingTokenFilter.class,
+                                 // doesn't consume whole stream!
+                                 LimitTokenCountFilter.class,
+                                 // Not broken: we forcefully add this, so we shouldn't
+                                 // also randomly pick it:
+                                 ValidatingTokenFilter.class,
+                                 // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets?
+                                 EdgeNGramTokenizer.class,
+                                 // nocommit: randomly generate the Side enum param here; then promote to brokenOffsets?
+                                 EdgeNGramTokenFilter.class
     );
   }
-  
+
+  // TODO: also fix these and remove (maybe):
+  // Classes that don't produce consistent graph offsets:
+  private static final Set<Class<?>> brokenOffsetsComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
+  static {
+    Collections.<Class<?>>addAll(brokenOffsetsComponents,
+                                 WordDelimiterFilter.class,
+                                 TrimFilter.class,
+                                 ReversePathHierarchyTokenizer.class,
+                                 PathHierarchyTokenizer.class,
+                                 HyphenationCompoundWordTokenFilter.class,
+                                 DictionaryCompoundWordTokenFilter.class,
+                                 // nocommit: corrupts graphs (offset consistency check):
+                                 PositionFilter.class,
+                                 // broken!
+                                 NGramTokenizer.class,
+                                 // broken!
+                                 NGramTokenFilter.class,
+                                 // nocommit it seems to mess up offsets!?
+                                 WikipediaTokenizer.class
+                                 );
+  }
   @BeforeClass
   public static void beforeClass() throws Exception {
     List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
@@ -146,7 +161,6 @@ public class TestRandomChains extends Ba
       ) {
         continue;
       }
-
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test synthetic or deprecated ctors, they likely have known bugs:
         if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class)) {
@@ -154,22 +168,21 @@ public class TestRandomChains extends Ba
         }
         if (Tokenizer.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
-            allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+                     allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           tokenizers.add(castConstructor(Tokenizer.class, ctor));
         } else if (TokenFilter.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
-            allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+                     allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           tokenfilters.add(castConstructor(TokenFilter.class, ctor));
         } else if (CharStream.class.isAssignableFrom(c)) {
           assertTrue(ctor.toGenericString() + " has unsupported parameter types",
-            allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
+                     allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
           charfilters.add(castConstructor(CharStream.class, ctor));
         } else {
           fail("Cannot get here");
         }
       }
     }
-
     final Comparator<Constructor<?>> ctorComp = new Comparator<Constructor<?>>() {
       @Override
       public int compare(Constructor<?> arg0, Constructor<?> arg1) {
@@ -179,28 +192,24 @@ public class TestRandomChains extends Ba
     Collections.sort(tokenizers, ctorComp);
     Collections.sort(tokenfilters, ctorComp);
     Collections.sort(charfilters, ctorComp);
-    
     if (VERBOSE) {
       System.out.println("tokenizers = " + tokenizers);
       System.out.println("tokenfilters = " + tokenfilters);
       System.out.println("charfilters = " + charfilters);
     }
   }
-  
   @AfterClass
   public static void afterClass() throws Exception {
     tokenizers = null;
     tokenfilters = null;
     charfilters = null;
   }
-  
   /** Hack to work around the stupidness of Oracle's strict Java backwards compatibility.
    * {@code Class<T>#getConstructors()} should return unmodifiable {@code List<Constructor<T>>} not array! */
   @SuppressWarnings("unchecked") 
   private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
     return (Constructor<T>) ctor;
   }
-  
   private static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
     final ClassLoader cld = TestRandomChains.class.getClassLoader();
     final String path = pckgname.replace('.', '/');
@@ -541,13 +550,21 @@ public class TestRandomChains extends Ba
     MockRandomAnalyzer(long seed) {
       this.seed = seed;
     }
+
+    public boolean offsetsAreCorrect() {
+      // nocommit: can we not do the full chain here!?
+      Random random = new Random(seed);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, new StringReader(""));
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      return filterSpec.offsetsAreCorrect;
+    }
     
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
       Random random = new Random(seed);
-      TokenizerSpec tokenizerspec = newTokenizer(random, reader);
-      TokenFilterSpec filterspec = newFilterChain(random, tokenizerspec.tokenizer);
-      return new TokenStreamComponents(tokenizerspec.tokenizer, filterspec.stream);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, reader);
+      TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+      return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
     }
 
     @Override
@@ -561,19 +578,21 @@ public class TestRandomChains extends Ba
     public String toString() {
       Random random = new Random(seed);
       StringBuilder sb = new StringBuilder();
-      CharFilterSpec charfilterSpec = newCharFilterChain(random, new StringReader(""));
+      CharFilterSpec charFilterSpec = newCharFilterChain(random, new StringReader(""));
       sb.append("\ncharfilters=");
-      sb.append(charfilterSpec.toString);
+      sb.append(charFilterSpec.toString);
       // intentional: initReader gets its own separate random
       random = new Random(seed);
-      TokenizerSpec tokenizerSpec = newTokenizer(random, charfilterSpec.reader);
+      TokenizerSpec tokenizerSpec = newTokenizer(random, charFilterSpec.reader);
       sb.append("\n");
       sb.append("tokenizer=");
       sb.append(tokenizerSpec.toString);
-      TokenFilterSpec tokenfilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
+      TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
       sb.append("\n");
       sb.append("filters=");
-      sb.append(tokenfilterSpec.toString);
+      sb.append(tokenFilterSpec.toString);
+      sb.append("\n");
+      sb.append("offsetsAreCorrect=" + tokenFilterSpec.offsetsAreCorrect);
       return sb.toString();
     }
     
@@ -620,6 +639,9 @@ public class TestRandomChains extends Ba
         final CheckThatYouDidntReadAnythingReaderWrapper wrapper = new CheckThatYouDidntReadAnythingReaderWrapper(reader);
         final Object args[] = newTokenizerArgs(random, wrapper, ctor.getParameterTypes());
         spec.tokenizer = createComponent(ctor, args, descr);
+        if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+          spec.offsetsAreCorrect = false;
+        }
         if (spec.tokenizer == null) {
           assertFalse(ctor.getDeclaringClass().getName() + " has read something in ctor but failed with UOE/IAE", wrapper.readSomething);
         }
@@ -648,8 +670,9 @@ public class TestRandomChains extends Ba
       return spec;
     }
     
-    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
+    private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
       TokenFilterSpec spec = new TokenFilterSpec();
+      spec.offsetsAreCorrect = offsetsAreCorrect;
       spec.stream = tokenizer;
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
@@ -658,13 +681,16 @@ public class TestRandomChains extends Ba
         // Insert ValidatingTF after each stage so we can
         // catch problems right after the TF that "caused"
         // them:
-        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
 
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
           final TokenFilter flt = createComponent(ctor, args, descr);
           if (flt != null) {
+            if (brokenOffsetsComponents.contains(ctor.getDeclaringClass())) {
+              spec.offsetsAreCorrect = false;
+            }
             spec.stream = flt;
             break;
           }
@@ -674,7 +700,7 @@ public class TestRandomChains extends Ba
       // Insert ValidatingTF after each stage so we can
       // catch problems right after the TF that "caused"
       // them:
-      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
 
       spec.toString = descr.toString();
       return spec;
@@ -722,11 +748,13 @@ public class TestRandomChains extends Ba
   static class TokenizerSpec {
     Tokenizer tokenizer;
     String toString;
+    boolean offsetsAreCorrect = true;
   }
   
   static class TokenFilterSpec {
     TokenStream stream;
     String toString;
+    boolean offsetsAreCorrect = true;
   }
   
   static class CharFilterSpec {
@@ -743,7 +771,8 @@ public class TestRandomChains extends Ba
         System.out.println("Creating random analyzer:" + a);
       }
       try {
-        checkRandomData(random, a, 1000);
+        checkRandomData(random, a, 1000, 20, false,
+                        false /* We already validate our own offsets... */);
       } catch (Throwable e) {
         System.err.println("Exception from random analyzer: " + a);
         throw e;

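For illustration only (not part of this commit): once a further offsets
offender is identified, flagging it is a one-line addition to the static
initializer above (MyBrokenOffsetsFilter is hypothetical):

    // Random chains containing this class are then built with offsetsAreCorrect=false:
    brokenOffsetsComponents.add(MyBrokenOffsetsFilter.class);
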
Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java Tue Apr 10 16:54:54 2012
@@ -65,7 +65,11 @@ public class TestTrimFilter extends Base
         new String[] { "a", "b", "c", "" },
         new int[] { 1, 0, 1, 3 },
         new int[] { 2, 1, 2, 3 },
-        new int[] { 1, 1, 1, 1 });
+        null,
+        new int[] { 1, 1, 1, 1 },
+        null,
+        null,
+        false);
   }
   
   /**

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue Apr 10 16:54:54 2012
@@ -72,14 +72,16 @@ public class TestWordDelimiterFilter ext
     assertTokenStreamContents(wdf, 
         new String[] { "foo", "bar", "foobar" },
         new int[] { 5, 9, 5 }, 
-        new int[] { 8, 12, 12 });
+        new int[] { 8, 12, 12 },
+        null, null, null, null, false);
 
     wdf = new WordDelimiterFilter(new SingleTokenTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
     
     assertTokenStreamContents(wdf,
         new String[] { "foo", "bar", "foobar" },
         new int[] { 5, 5, 5 },
-        new int[] { 6, 6, 6 });
+        new int[] { 6, 6, 6 },
+        null, null, null, null, false);
   }
   
   @Test
@@ -123,7 +125,8 @@ public class TestWordDelimiterFilter ext
     assertTokenStreamContents(wdf,
         new String[] { "foo", "bar", "foobar"},
         new int[] { 8, 12, 8 },
-        new int[] { 11, 15, 15 });
+        new int[] { 11, 15, 15 },
+        null, null, null, null, false);
   }
 
   public void doSplit(final String input, String... output) throws Exception {
@@ -230,18 +233,27 @@ public class TestWordDelimiterFilter ext
     assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
         new int[] { 0, 9 },
         new int[] { 6, 13 },
-        new int[] { 1, 1 });
+        null,
+        new int[] { 1, 1 },
+        null,
+        false);
     
     /* only in this case, posInc of 2 ?! */
     assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
         new int[] { 0, 9, 12, 9 },
         new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 1, 1, 0 });
+        null,
+        new int[] { 1, 1, 1, 0 },
+        null,
+        false);
     
     assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
         new int[] { 0, 9, 15 },
         new int[] { 6, 14, 19 },
-        new int[] { 1, 1, 1 });
+        null,
+        new int[] { 1, 1, 1 },
+        null,
+        false);
     
     /* analyzer that will consume tokens with large position increments */
     Analyzer a2 = new Analyzer() {
@@ -258,24 +270,36 @@ public class TestWordDelimiterFilter ext
     assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
         new int[] { 0, 7, 16 },
         new int[] { 6, 15, 20 },
-        new int[] { 1, 10, 1 });
+        null,
+        new int[] { 1, 10, 1 },
+        null,
+        false);
     
     /* the "/" had a position increment of 10, where did it go?!?!! */
     assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
         new int[] { 0, 9 },
         new int[] { 6, 13 },
-        new int[] { 1, 11 });
+        null,
+        new int[] { 1, 11 },
+        null,
+        false);
     
     /* in this case, the increment of 10 from the "/" is carried over */
     assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
         new int[] { 0, 9, 12, 9 },
         new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 11, 1, 0 });
+        null,
+        new int[] { 1, 11, 1, 0 },
+        null,
+        false);
     
     assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
         new int[] { 0, 9, 15 },
         new int[] { 6, 14, 19 },
-        new int[] { 1, 11, 1 });
+        null,
+        new int[] { 1, 11, 1 },
+        null,
+        false);
 
     Analyzer a3 = new Analyzer() {
       @Override
@@ -292,14 +316,20 @@ public class TestWordDelimiterFilter ext
         new String[] { "lucene", "solr", "lucenesolr" },
         new int[] { 0, 7, 0 },
         new int[] { 6, 11, 11 },
-        new int[] { 1, 1, 0 });
+        null,
+        new int[] { 1, 1, 0 },
+        null,
+        false);
 
     /* the stopword should add a gap here */
     assertAnalyzesTo(a3, "the lucene.solr", 
         new String[] { "lucene", "solr", "lucenesolr" }, 
         new int[] { 4, 11, 4 }, 
         new int[] { 10, 15, 15 },
-        new int[] { 2, 1, 0 });
+        null,
+        new int[] { 2, 1, 0 },
+        null,
+        false);
   }
   
   /** blast some random strings through the analyzer */
@@ -322,7 +352,7 @@ public class TestWordDelimiterFilter ext
           return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
         }
       };
-      checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+      checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
     }
   }
   

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Tue Apr 10 16:54:54 2012
@@ -94,7 +94,15 @@ public class EdgeNGramTokenFilterTest ex
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+    assertTokenStreamContents(tokenizer,
+                              new String[]{"e","de","cde"},
+                              new int[]{4,3,2},
+                              new int[]{5,5,5},
+                              null,
+                              null,
+                              null,
+                              null,
+                              false);
   }
   
   public void testSmallTokenInStream() throws Exception {
@@ -151,7 +159,7 @@ public class EdgeNGramTokenFilterTest ex
             new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
       }    
     };
-    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
   }
   
   public void testEmptyTerm() throws Exception {

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Tue Apr 10 16:54:54 2012
@@ -90,7 +90,7 @@ public class EdgeNGramTokenizerTest exte
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
+    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, null, null, null, 5 /* abcde */, false);
   }
   
   public void testReset() throws Exception {
@@ -109,8 +109,8 @@ public class EdgeNGramTokenizerTest exte
         return new TokenStreamComponents(tokenizer, tokenizer);
       }    
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
     
     Analyzer b = new Analyzer() {
       @Override
@@ -119,7 +119,7 @@ public class EdgeNGramTokenizerTest exte
         return new TokenStreamComponents(tokenizer, tokenizer);
       }    
     };
-    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, b, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192, false, false);
   }
 }

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Tue Apr 10 16:54:54 2012
@@ -77,7 +77,8 @@ public class NGramTokenFilterTest extend
     assertTokenStreamContents(filter,
         new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
         new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
-        new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+        new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+        null, null, null, null, false
         );
   }
   
@@ -130,7 +131,7 @@ public class NGramTokenFilterTest extend
             new NGramTokenFilter(tokenizer, 2, 15));
       }    
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
   }
   
   public void testEmptyTerm() throws Exception {

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1311864&r1=1311863&r2=1311864&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Tue Apr 10 16:54:54 2012
@@ -73,7 +73,11 @@ public class NGramTokenizerTest extends 
         new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
         new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
         new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
-        5 /* abcde */
+        null,
+        null,
+        null,
+        5 /* abcde */,
+        false
         );
   }
   
@@ -98,7 +102,7 @@ public class NGramTokenizerTest extends 
         return new TokenStreamComponents(tokenizer, tokenizer);
       }    
     };
-    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
-    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
+    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
+    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
   }
 }