You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/03/24 21:10:42 UTC

svn commit: r1304904 - in /lucene/dev/branches/branch_3x/lucene: contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/ contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/ test-framework/src/java/org/apache/lucene/analysis/ test-framework/src/java/org/apache/lucene/util/

Author: sarowe
Date: Sat Mar 24 20:10:42 2012
New Revision: 1304904

URL: http://svn.apache.org/viewvc?rev=1304904&view=rev
Log:
LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>

Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java Sat Mar 24 20:10:42 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/24/12 10:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 12:34 PM */
 
 package org.apache.lucene.analysis.charfilter;
 
@@ -40,7 +40,7 @@ import org.apache.lucene.analysis.util.O
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 1/24/12 10:07 AM from the specification file
+ * on 3/24/12 12:34 PM from the specification file
  * <tt>C:/cygwin/home/s/svn/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
  */
 public final class HTMLStripCharFilter extends BaseCharFilter {
@@ -30978,7 +30978,9 @@ public final class HTMLStripCharFilter e
     case START_TAG_TAIL_EXCLUDE:
     case SERVER_SIDE_INCLUDE:
     case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      // add (length of input that won't be output) [ - (substitution length) = 0 ]
       cumulativeDiff += yychar - inputStart;
+      // position the correction at (already output length) [ + (substitution length) = 0 ]
       addOffCorrectMap(outputCharCount, cumulativeDiff);
       outputSegment.clear();
       eofReturnValue = -1;
@@ -30986,7 +30988,9 @@ public final class HTMLStripCharFilter e
     }
     case CHARACTER_REFERENCE_TAIL: {        // Substitute
       // At end of file, allow char refs without semicolons
+      // add (length of input that won't be output) - (substitution length)
       cumulativeDiff += inputSegment.length() - outputSegment.length();
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
       eofReturnValue = outputSegment.nextChar();
       break;
@@ -31098,17 +31102,10 @@ public final class HTMLStripCharFilter e
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 34: 
-          { cumulativeDiff += yychar - inputStart + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-          }
-        case 54: break;
         case 16: 
           { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
           }
-        case 55: break;
+        case 54: break;
         case 18: 
           { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
@@ -31118,11 +31115,11 @@ public final class HTMLStripCharFilter e
       yybegin(END_TAG_TAIL_SUBSTITUTE);
     }
           }
-        case 56: break;
+        case 55: break;
         case 35: 
           { yybegin(SCRIPT);
           }
-        case 57: break;
+        case 56: break;
         case 46: 
           { yybegin(SCRIPT);
     if (escapeSCRIPT) {
@@ -31132,24 +31129,17 @@ public final class HTMLStripCharFilter e
       return outputSegment.nextChar();
     }
           }
-        case 58: break;
+        case 57: break;
         case 22: 
           { previousRestoreState = restoreState;
     restoreState = SERVER_SIDE_INCLUDE;
     yybegin(DOUBLE_QUOTED_STRING);
           }
-        case 59: break;
+        case 58: break;
         case 40: 
           { yybegin(SCRIPT_COMMENT);
           }
-        case 60: break;
-        case 47: 
-          { cumulativeDiff += inputSegment.length() + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(CDATA);
-          }
-        case 61: break;
+        case 59: break;
         case 31: 
           { int matchLength = yylength();
     inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31184,7 +31174,43 @@ public final class HTMLStripCharFilter e
       return outputSegment.nextChar();
     }
           }
-        case 62: break;
+        case 60: break;
+        case 53: 
+          { // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try { // High surrogates are in decimal range [55296, 56319]
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(1, 6) + "'";
+    }
+    if (Character.isHighSurrogate(highSurrogate)) {
+      char lowSurrogate = '\u0000';
+      try { // Low surrogates are in decimal range [56320, 57343]
+        lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+      } catch(Exception e) { // should never happen
+        assert false: "Exception parsing low surrogate '"
+                    + surrogatePair.substring(9, 14) + "'";
+      }
+      if (Character.isLowSurrogate(lowSurrogate)) {
+        outputSegment = entitySegment;
+        outputSegment.clear();
+        outputSegment.unsafeWrite(lowSurrogate);
+        // add (previously matched input length) + (this match length) - (substitution length)
+        cumulativeDiff += inputSegment.length() + yylength() - 2;
+        // position the correction at (already output length) + (substitution length)
+        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+        inputSegment.clear();
+        yybegin(YYINITIAL);
+        return highSurrogate;
+      }
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
+          }
+        case 61: break;
         case 30: 
           { int length = yylength();
     inputSegment.write(zzBuffer, zzStartRead, length);
@@ -31194,15 +31220,7 @@ public final class HTMLStripCharFilter e
     outputSegment = entitySegment;
     yybegin(CHARACTER_REFERENCE_TAIL);
           }
-        case 63: break;
-        case 7: 
-          { cumulativeDiff
-        += inputSegment.length() + yylength() - outputSegment.length();
-    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-    yybegin(YYINITIAL);
-    return outputSegment.nextChar();
-          }
-        case 64: break;
+        case 62: break;
         case 6: 
           { int matchLength = yylength();
     inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31236,55 +31254,18 @@ public final class HTMLStripCharFilter e
       return outputSegment.nextChar();
     }
           }
-        case 65: break;
+        case 63: break;
         case 29: 
           { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
           }
-        case 66: break;
-        case 52: 
-          { // Handle paired UTF-16 surrogates.
-    String surrogatePair = yytext();
-    char highSurrogate = '\u0000';
-    try { // High surrogates are in decimal range [55296, 56319]
-      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing high surrogate '"
-                  + surrogatePair.substring(1, 6) + "'";
-    }
-    if (Character.isHighSurrogate(highSurrogate)) {
-      outputSegment = entitySegment;
-      outputSegment.clear();
-      try {
-        outputSegment.unsafeWrite
-            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
-      } catch(Exception e) { // should never happen
-        assert false: "Exception parsing low surrogate '"
-                    + surrogatePair.substring(10, 14) + "'";
-      }
-      cumulativeDiff += inputSegment.length() + yylength() - 2;
-      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
-      inputSegment.clear();
-      yybegin(YYINITIAL);
-      return highSurrogate;
-    }
-    yypushback(surrogatePair.length() - 1); // Consume only '#'
-    inputSegment.append('#');
-    yybegin(NUMERIC_CHARACTER);
-          }
-        case 67: break;
+        case 64: break;
         case 3: 
           { inputStart = yychar;
   inputSegment.clear();
   inputSegment.append('&');
   yybegin(AMPERSAND);
           }
-        case 68: break;
-        case 37: 
-          { cumulativeDiff += yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    yybegin(YYINITIAL);
-          }
-        case 69: break;
+        case 65: break;
         case 8: 
           { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
@@ -31294,11 +31275,21 @@ public final class HTMLStripCharFilter e
       yybegin(START_TAG_TAIL_SUBSTITUTE);
     }
           }
-        case 70: break;
+        case 66: break;
+        case 27: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+          }
+        case 67: break;
         case 38: 
           { yybegin(restoreState);
           }
-        case 71: break;
+        case 68: break;
         case 19: 
           { inputSegment.write(zzBuffer, zzStartRead, yylength());
     if (null != escapedTags
@@ -31308,10 +31299,45 @@ public final class HTMLStripCharFilter e
       yybegin(END_TAG_TAIL_EXCLUDE);
     }
           }
-        case 72: break;
+        case 69: break;
+        case 26: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    outputSegment = inputSegment;
+    yybegin(YYINITIAL);
+          }
+        case 70: break;
         case 13: 
           { inputSegment.append(zzBuffer[zzStartRead]);
           }
+        case 71: break;
+        case 36: 
+          { yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_END_TAG_REPLACEMENT;
+    }
+          }
+        case 72: break;
+        case 47: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(CDATA);
+          }
         case 73: break;
         case 28: 
           { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
@@ -31321,11 +31347,11 @@ public final class HTMLStripCharFilter e
           { inputSegment.write(zzBuffer, zzStartRead, yylength());
           }
         case 75: break;
-        case 26: 
-          { cumulativeDiff += inputSegment.length() + yylength();
+        case 37: 
+          { // add (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    outputSegment = inputSegment;
     yybegin(YYINITIAL);
           }
         case 76: break;
@@ -31351,71 +31377,59 @@ public final class HTMLStripCharFilter e
           { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
           }
         case 80: break;
+        case 24: 
+          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+          }
+        case 81: break;
         case 49: 
           { inputSegment.clear();
     yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
     int returnValue;
     if (escapeSCRIPT) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       returnValue = outputSegment.nextChar();
     } else {
+      // add (this match length) - (substitution length)
       cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
       returnValue = SCRIPT_REPLACEMENT;
     }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
           }
-        case 81: break;
-        case 27: 
-          { cumulativeDiff += inputSegment.length() + yylength() - 1;
-    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
-          }
         case 82: break;
-        case 24: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
-     outputSegment = inputSegment;
-     yybegin(YYINITIAL);
-     return outputSegment.nextChar();
-          }
-        case 83: break;
         case 2: 
           { inputStart = yychar;
   inputSegment.clear();
   inputSegment.append('<');
   yybegin(LEFT_ANGLE_BRACKET);
           }
+        case 83: break;
+        case 14: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+    cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+          }
         case 84: break;
-        case 50: 
-          { // Handle paired UTF-16 surrogates.
-    outputSegment = entitySegment;
-    outputSegment.clear();
-    String surrogatePair = yytext();
-    char highSurrogate = '\u0000';
-    try {
-      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing high surrogate '"
-                  + surrogatePair.substring(2, 6) + "'";
-    }
-    try {
-      outputSegment.unsafeWrite
-          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
-    } catch(Exception e) { // should never happen
-      assert false: "Exception parsing low surrogate '"
-                  + surrogatePair.substring(10, 14) + "'";
-    }
-    cumulativeDiff += inputSegment.length() + yylength() - 2;
-    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+        case 34: 
+          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
+    cumulativeDiff += yychar - inputStart + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0]
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
-    return highSurrogate;
           }
         case 85: break;
         case 32: 
@@ -31430,6 +31444,15 @@ public final class HTMLStripCharFilter e
           { yybegin(STYLE);
           }
         case 88: break;
+        case 7: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+          }
+        case 89: break;
         case 4: 
           { yypushback(1);
     outputSegment = inputSegment;
@@ -31437,14 +31460,6 @@ public final class HTMLStripCharFilter e
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
           }
-        case 89: break;
-        case 25: 
-          { cumulativeDiff += inputSegment.length() + yylength() - 1;
-    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
-    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
-          }
         case 90: break;
         case 12: 
           { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
@@ -31462,6 +31477,34 @@ public final class HTMLStripCharFilter e
           { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
           }
         case 94: break;
+        case 50: 
+          { // Handle paired UTF-16 surrogates.
+    outputSegment = entitySegment;
+    outputSegment.clear();
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try {
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(2, 6) + "'";
+    }
+    try {
+      outputSegment.unsafeWrite
+          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing low surrogate '"
+                  + surrogatePair.substring(10, 14) + "'";
+    }
+    // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - 2;
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return highSurrogate;
+          }
+        case 95: break;
         case 51: 
           { // Handle paired UTF-16 surrogates.
     String surrogatePair = yytext();
@@ -31483,7 +31526,9 @@ public final class HTMLStripCharFilter e
       outputSegment = entitySegment;
       outputSegment.clear();
       outputSegment.unsafeWrite(lowSurrogate);
+      // add (previously matched input length) + (this match length) - (substitution length)
       cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
       inputSegment.clear();
       yybegin(YYINITIAL);
@@ -31493,12 +31538,26 @@ public final class HTMLStripCharFilter e
     inputSegment.append('#');
     yybegin(NUMERIC_CHARACTER);
           }
-        case 95: break;
+        case 96: break;
+        case 25: 
+          { // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+          }
+        case 97: break;
         case 11: 
           { inputSegment.write(zzBuffer, zzStartRead, yylength());
     yybegin(LEFT_ANGLE_BRACKET_SPACE);
           }
-        case 96: break;
+        case 98: break;
+        case 44: 
+          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+          }
+        case 99: break;
         case 33: 
           { yybegin(YYINITIAL);
     if (escapeBR) {
@@ -31506,23 +31565,26 @@ public final class HTMLStripCharFilter e
       outputSegment = inputSegment;
       return outputSegment.nextChar();
     } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
       inputSegment.reset();
       return BR_START_TAG_REPLACEMENT;
     }
           }
-        case 97: break;
-        case 44: 
-          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
-          }
-        case 98: break;
+        case 100: break;
         case 17: 
           { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
           }
-        case 99: break;
-        case 53: 
+        case 101: break;
+        case 21: 
+          { previousRestoreState = restoreState;
+    restoreState = SERVER_SIDE_INCLUDE;
+    yybegin(SINGLE_QUOTED_STRING);
+          }
+        case 102: break;
+        case 52: 
           { // Handle paired UTF-16 surrogates.
     String surrogatePair = yytext();
     char highSurrogate = '\u0000';
@@ -31533,89 +31595,64 @@ public final class HTMLStripCharFilter e
                   + surrogatePair.substring(1, 6) + "'";
     }
     if (Character.isHighSurrogate(highSurrogate)) {
-      char lowSurrogate = '\u0000';
-      try { // Low surrogates are in decimal range [56320, 57343]
-        lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+      outputSegment = entitySegment;
+      outputSegment.clear();
+      try {
+        outputSegment.unsafeWrite
+            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
       } catch(Exception e) { // should never happen
         assert false: "Exception parsing low surrogate '"
-                    + surrogatePair.substring(9, 14) + "'";
-      }
-      if (Character.isLowSurrogate(lowSurrogate)) {
-        outputSegment = entitySegment;
-        outputSegment.clear();
-        outputSegment.unsafeWrite(lowSurrogate);
-        cumulativeDiff += inputSegment.length() + yylength() - 2;
-        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
-        inputSegment.clear();
-        yybegin(YYINITIAL);
-        return highSurrogate;
+                    + surrogatePair.substring(10, 14) + "'";
       }
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(YYINITIAL);
+      return highSurrogate;
     }
     yypushback(surrogatePair.length() - 1); // Consume only '#'
     inputSegment.append('#');
     yybegin(NUMERIC_CHARACTER);
           }
-        case 100: break;
-        case 14: 
-          { cumulativeDiff += inputSegment.length() + yylength();
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(YYINITIAL);
+        case 103: break;
+        case 9: 
+          { inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(START_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(START_TAG_TAIL_EXCLUDE);
+    }
           }
-        case 101: break;
-        case 21: 
-          { previousRestoreState = restoreState;
-    restoreState = SERVER_SIDE_INCLUDE;
-    yybegin(SINGLE_QUOTED_STRING);
+        case 104: break;
+        case 15: 
+          { 
           }
-        case 102: break;
+        case 105: break;
         case 48: 
           { inputSegment.clear();
     yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position the offset correction at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
     int returnValue;
     if (escapeSTYLE) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       returnValue = outputSegment.nextChar();
     } else {
+      // add (this match length) - (substitution length)
       cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
       returnValue = STYLE_REPLACEMENT;
     }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
           }
-        case 103: break;
-        case 9: 
-          { inputSegment.write(zzBuffer, zzStartRead, yylength());
-    if (null != escapedTags
-        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
-      yybegin(START_TAG_TAIL_INCLUDE);
-    } else {
-      yybegin(START_TAG_TAIL_EXCLUDE);
-    }
-          }
-        case 104: break;
-        case 36: 
-          { yybegin(YYINITIAL);
-    if (escapeBR) {
-      inputSegment.write(zzBuffer, zzStartRead, yylength());
-      outputSegment = inputSegment;
-      return outputSegment.nextChar();
-    } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
-      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-      inputSegment.reset();
-      return BR_END_TAG_REPLACEMENT;
-    }
-          }
-        case 105: break;
-        case 15: 
-          { 
-          }
         case 106: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Sat Mar 24 20:10:42 2012
@@ -294,7 +294,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
     case START_TAG_TAIL_EXCLUDE:
     case SERVER_SIDE_INCLUDE:
     case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      // add (length of input that won't be output) [ - (substitution length) = 0 ]
       cumulativeDiff += yychar - inputStart;
+      // position the correction at (already output length) [ + (substitution length) = 0 ]
       addOffCorrectMap(outputCharCount, cumulativeDiff);
       outputSegment.clear();
       eofReturnValue = -1;
@@ -302,7 +304,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
     }
     case CHARACTER_REFERENCE_TAIL: {        // Substitute
       // At end of file, allow char refs without semicolons
+      // add (length of input that won't be output) - (substitution length)
       cumulativeDiff += inputSegment.length() - outputSegment.length();
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
       eofReturnValue = outputSegment.nextChar();
       break;
@@ -375,7 +379,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
       assert false: "Exception parsing low surrogate '"
                   + surrogatePair.substring(10, 14) + "'";
     }
+    // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - 2;
+    // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -404,7 +410,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
         assert false: "Exception parsing low surrogate '"
                     + surrogatePair.substring(10, 14) + "'";
       }
+      // add (previously matched input length) + (this match length) - (substitution length)
       cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
       inputSegment.clear();
       yybegin(YYINITIAL);
@@ -438,7 +446,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
       outputSegment = entitySegment;
       outputSegment.clear();
       outputSegment.unsafeWrite(lowSurrogate);
+      // add (previously matched input length) + (this match length) - (substitution length)
       cumulativeDiff += inputSegment.length() + yylength() - 2;
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
       inputSegment.clear();
       yybegin(YYINITIAL);
@@ -473,7 +483,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
         outputSegment = entitySegment;
         outputSegment.clear();
         outputSegment.unsafeWrite(lowSurrogate);
+        // add (previously matched input length) + (this match length) - (substitution length)
         cumulativeDiff += inputSegment.length() + yylength() - 2;
+        // position the correction at (already output length) + (substitution length)
         addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
         inputSegment.clear();
         yybegin(YYINITIAL);
@@ -558,8 +570,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <CHARACTER_REFERENCE_TAIL> {
   ";" {
-    cumulativeDiff
-        += inputSegment.length() + yylength() - outputSegment.length();
+    // add (previously matched input length) + (this match length) - (substitution length)
+    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+    // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
     yybegin(YYINITIAL);
     return outputSegment.nextChar();
@@ -575,9 +588,10 @@ InlineElment = ( [aAbBiIqQsSuU]         
       outputSegment = inputSegment;
       return outputSegment.nextChar();
     } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
-      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
       inputSegment.reset();
       return BR_END_TAG_REPLACEMENT;
     }
@@ -613,7 +627,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <END_TAG_TAIL_EXCLUDE> {
   \s* ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -622,7 +638,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <END_TAG_TAIL_SUBSTITUTE> {
   \s* ">" {
+    // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -638,7 +656,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
     yybegin(LEFT_ANGLE_BRACKET_SPACE);
   }
   "?" [^>]* [/?] ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -650,8 +670,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
       outputSegment = inputSegment;
       return outputSegment.nextChar();
     } else {
-      cumulativeDiff
-          += inputSegment.length() + yylength() - outputSegment.length();
+      // add (previously matched input length) + (this match length) - (substitution length)
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
       inputSegment.reset();
       return BR_START_TAG_REPLACEMENT;
@@ -709,7 +730,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <START_TAG_TAIL_EXCLUDE> {
    ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     outputSegment = inputSegment;
@@ -719,7 +742,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <START_TAG_TAIL_SUBSTITUTE> {
   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s*  "/"? ">" {
+    // add (previously matched input length) + (this match length) - (substitution length)
     cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction at (already output length) + (substitution length)
     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -730,7 +755,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 <BANG> {
   "--" { yybegin(COMMENT); }
   ">" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -743,7 +770,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
   // [21] CDEnd   ::= ']]>'
   //
   "[CDATA[" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(CDATA);
@@ -755,7 +784,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 
 <CDATA> {
   "]]>" {
+    // add (this match length) [ - (substitution length) = 0 ]
     cumulativeDiff += yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0 ]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     yybegin(YYINITIAL);
   }
@@ -765,7 +796,9 @@ InlineElment = ( [aAbBiIqQsSuU]         
 <COMMENT> {
   "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
   "-->" {
+    // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
     cumulativeDiff += yychar - inputStart + yylength();
+    // position the correction at (already output length) [ + (substitution length) = 0]
     addOffCorrectMap(outputCharCount, cumulativeDiff);
     inputSegment.clear();
     yybegin(YYINITIAL);
@@ -821,19 +854,23 @@ InlineElment = ( [aAbBiIqQsSuU]         
   "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
     inputSegment.clear();
     yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position the offset correction at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
     int returnValue;
     if (escapeSCRIPT) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       returnValue = outputSegment.nextChar();
     } else {
+      // add (this match length) - (substitution length)
       cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
       returnValue = SCRIPT_REPLACEMENT;
     }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
   }
   [^] { }
@@ -844,19 +881,23 @@ InlineElment = ( [aAbBiIqQsSuU]         
   "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
     inputSegment.clear();
     yybegin(YYINITIAL);
+    // add (previously matched input length) -- current match and substitution handled below
     cumulativeDiff += yychar - inputStart;
-    int outputEnd = outputCharCount;
+    // position the offset correction at (already output length) -- substitution handled below
+    int offsetCorrectionPos = outputCharCount;
     int returnValue;
     if (escapeSTYLE) {
       inputSegment.write(zzBuffer, zzStartRead, yylength());
       outputSegment = inputSegment;
       returnValue = outputSegment.nextChar();
     } else {
+      // add (this match length) - (substitution length)
       cumulativeDiff += yylength() - 1;
-      ++outputEnd;
+      // add (substitution length)
+      ++offsetCorrectionPos;
       returnValue = STYLE_REPLACEMENT;
     }
-    addOffCorrectMap(outputEnd, cumulativeDiff);
+    addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
     return returnValue;
   }
   [^] { }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Sat Mar 24 20:10:42 2012
@@ -32,12 +32,26 @@ import org.apache.lucene.analysis.BaseTo
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util._TestUtil;
 
 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
+  static private Analyzer newTestAnalyzer() {
+    return new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new HTMLStripCharFilter(CharReader.get(reader));
+      }
+    };
+  }
+
   //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
   //
   public void test() throws IOException {
@@ -495,41 +509,17 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testRandom() throws Exception {
-    Analyzer analyzer = new ReusableAnalyzerBase() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
-      }
-    };
-    
     int numRounds = RANDOM_MULTIPLIER * 10000;
-    checkRandomData(random, analyzer, numRounds);
+    checkRandomData(random, newTestAnalyzer(), numRounds);
   }
   
   public void testRandomHugeStrings() throws Exception {
-    Analyzer analyzer = new ReusableAnalyzerBase() {
-
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(reader));
-      }
-    };
-    
     int numRounds = RANDOM_MULTIPLIER * 200;
-    checkRandomData(random, analyzer, numRounds, 8192);
+    checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
+  }
+
+  public void testCloseBR() throws Exception {
+    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
   }
   
   public void testServerSideIncludes() throws Exception {
@@ -799,9 +789,7 @@ public class HTMLStripCharFilterTest ext
   public void testRandomBrokenHTML() throws Exception {
     int maxNumElements = 10000;
     String text = _TestUtil.randomHtmlishString(random, maxNumElements);
-    Reader reader = new HTMLStripCharFilter
-        (CharReader.get(new StringReader(text)));
-    while (reader.read() != -1);
+    checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
   }
 
   public void testRandomText() throws Exception {
@@ -840,18 +828,7 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testUTF16Surrogates() throws Exception {
-    Analyzer analyzer = new ReusableAnalyzerBase() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }
-
-      @Override
-      protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
-      }
-    };
+    Analyzer analyzer = newTestAnalyzer();
     // Paired surrogates
     assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
         new String[] { "one", "two", "\uD86C\uDC01three" } );

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 24 20:10:42 2012
@@ -391,188 +391,193 @@ public abstract class BaseTokenStreamTes
       }
 
 
-      if (VERBOSE) {
-        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
-      }
+      checkAnalysisConsistency(random, a, useCharFilter, text);
+    }
+  }
 
-      int remainder = random.nextInt(10);
-      Reader reader = new StringReader(text);
-      TokenStream ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-      assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
-      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
-      OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
-      PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
-      PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
-      TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
-      List<String> tokens = new ArrayList<String>();
-      List<String> types = new ArrayList<String>();
-      List<Integer> positions = new ArrayList<Integer>();
-      List<Integer> positionLengths = new ArrayList<Integer>();
-      List<Integer> startOffsets = new ArrayList<Integer>();
-      List<Integer> endOffsets = new ArrayList<Integer>();
-      ts.reset();
-
-      // First pass: save away "correct" tokens
-      while (ts.incrementToken()) {
-        tokens.add(termAtt.toString());
-        if (typeAtt != null) types.add(typeAtt.type());
-        if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
-        if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
-        if (offsetAtt != null) {
-          startOffsets.add(offsetAtt.startOffset());
-          endOffsets.add(offsetAtt.endOffset());
-        }
-      }
-      ts.end();
-      ts.close();
-      // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
-      if (!tokens.isEmpty()) {
-
-        // KWTokenizer (for example) can produce a token
-        // even when input is length 0:
-        if (text.length() != 0) {
-
-          // (Optional) second pass: do something evil:
-          final int evilness = random.nextInt(50);
-          if (evilness == 17) {
-            if (VERBOSE) {
-              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
-            }
-            // Throw an errant exception from the Reader:
+  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
 
-            MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
-            evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
-            reader = evilReader;
-
-            try {
-              // NOTE: some Tokenizers go and read characters
-              // when you call .setReader(Reader), eg
-              // PatternTokenizer.  This is a bit
-              // iffy... (really, they should only
-              // pull from the Reader when you call
-              // .incremenToken(), I think?), but we
-              // currently allow it, so, we must call
-              // a.tokenStream inside the try since we may
-              // hit the exc on init:
-              ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
-              ts.reset();
-              while (ts.incrementToken());
-              fail("did not hit exception");
-            } catch (RuntimeException re) {
-              assertTrue(MockReaderWrapper.isMyEvilException(re));
-            }
-            try {
-              ts.end();
-            } catch (AssertionError ae) {
-              // Catch & ignore MockTokenizer's
-              // anger...
-              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
-                // OK
-              } else {
-                throw ae;
-              }
-            }
-            ts.close();
-          } else if (evilness == 7) {
-            // Only consume a subset of the tokens:
-            final int numTokensToRead = random.nextInt(tokens.size());
-            if (VERBOSE) {
-              System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
-            }
+    if (VERBOSE) {
+      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+    }
+
+    int remainder = random.nextInt(10);
+    Reader reader = new StringReader(text);
+    TokenStream ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+    assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
+    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+    OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
+    PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+    PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
+    TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
+    List<String> tokens = new ArrayList<String>();
+    List<String> types = new ArrayList<String>();
+    List<Integer> positions = new ArrayList<Integer>();
+    List<Integer> positionLengths = new ArrayList<Integer>();
+    List<Integer> startOffsets = new ArrayList<Integer>();
+    List<Integer> endOffsets = new ArrayList<Integer>();
+    ts.reset();
+
+    // First pass: save away "correct" tokens
+    while (ts.incrementToken()) {
+      tokens.add(termAtt.toString());
+      if (typeAtt != null) types.add(typeAtt.type());
+      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
+      if (offsetAtt != null) {
+        startOffsets.add(offsetAtt.startOffset());
+        endOffsets.add(offsetAtt.endOffset());
+      }
+    }
+    ts.end();
+    ts.close();
+    // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
+    if (!tokens.isEmpty()) {
+
+      // KWTokenizer (for example) can produce a token
+      // even when input is length 0:
+      if (text.length() != 0) {
+
+        // (Optional) second pass: do something evil:
+        final int evilness = random.nextInt(50);
+        if (evilness == 17) {
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+          }
+          // Throw an errant exception from the Reader:
 
-            reader = new StringReader(text);
-            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+          evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+          reader = evilReader;
+
+          try {
+            // NOTE: some Tokenizers go and read characters
+            // when you call .setReader(Reader), eg
+            // PatternTokenizer.  This is a bit
+            // iffy... (really, they should only
+            // pull from the Reader when you call
+            // .incrementToken(), I think?), but we
+            // currently allow it, so, we must call
+            // a.tokenStream inside the try since we may
+            // hit the exc on init:
+            ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
             ts.reset();
-            for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
-              assertTrue(ts.incrementToken());
+            while (ts.incrementToken());
+            fail("did not hit exception");
+          } catch (RuntimeException re) {
+            assertTrue(MockReaderWrapper.isMyEvilException(re));
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
             }
-            try {
-              ts.end();
-            } catch (AssertionError ae) {
-              // Catch & ignore MockTokenizer's
-              // anger...
-              if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
-                // OK
-              } else {
-                throw ae;
-              }
+          }
+          ts.close();
+        } else if (evilness == 7) {
+          // Only consume a subset of the tokens:
+          final int numTokensToRead = random.nextInt(tokens.size());
+          if (VERBOSE) {
+            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+          }
+
+          reader = new StringReader(text);
+          ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+          ts.reset();
+          for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+            assertTrue(ts.incrementToken());
+          }
+          try {
+            ts.end();
+          } catch (AssertionError ae) {
+            // Catch & ignore MockTokenizer's
+            // anger...
+            if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+              // OK
+            } else {
+              throw ae;
             }
-            ts.close();
           }
+          ts.close();
         }
+      }
 
-        // Final pass: verify clean tokenization matches
-        // results from first pass:
+      // Final pass: verify clean tokenization matches
+      // results from first pass:
+
+      if (VERBOSE) {
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
+      }
+      reader = new StringReader(text);
 
+      if (random.nextInt(30) == 7) {
         if (VERBOSE) {
-          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
         }
-        reader = new StringReader(text);
 
-        if (random.nextInt(30) == 7) {
-          if (VERBOSE) {
-            System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
-          }
-
-          reader = new MockReaderWrapper(random, reader);
-        }
+        reader = new MockReaderWrapper(random, reader);
+      }
 
-        ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-        if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
-          // offset + pos + posLength + type
-          assertTokenStreamContents(ts, 
-            tokens.toArray(new String[tokens.size()]),
-            toIntArray(startOffsets),
-            toIntArray(endOffsets),
-            types.toArray(new String[types.size()]),
-            toIntArray(positions),
-            toIntArray(positionLengths),
-            text.length());
-        } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
-          // offset + pos + type
-          assertTokenStreamContents(ts, 
-            tokens.toArray(new String[tokens.size()]),
-            toIntArray(startOffsets),
-            toIntArray(endOffsets),
-            types.toArray(new String[types.size()]),
-            toIntArray(positions),
-            null,
-            text.length());
-        } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
-          // offset + pos + posLength
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              toIntArray(positions),
-              toIntArray(positionLengths),
-              text.length());
-        } else if (posIncAtt != null && offsetAtt != null) {
-          // offset + pos
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              toIntArray(positions),
-              null,
-              text.length());
-        } else if (offsetAtt != null) {
-          // offset
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]),
-              toIntArray(startOffsets),
-              toIntArray(endOffsets),
-              null,
-              null,
-              null,
-              text.length());
-        } else {
-          // terms only
-          assertTokenStreamContents(ts, 
-              tokens.toArray(new String[tokens.size()]));
-        }
+      ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+      if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength + type
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  types.toArray(new String[types.size()]),
+                                  toIntArray(positions),
+                                  toIntArray(positionLengths),
+                                  text.length());
+      } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+        // offset + pos + type
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  types.toArray(new String[types.size()]),
+                                  toIntArray(positions),
+                                  null,
+                                  text.length());
+      } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  toIntArray(positions),
+                                  toIntArray(positionLengths),
+                                  text.length());
+      } else if (posIncAtt != null && offsetAtt != null) {
+        // offset + pos
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  toIntArray(positions),
+                                  null,
+                                  text.length());
+      } else if (offsetAtt != null) {
+        // offset
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]),
+                                  toIntArray(startOffsets),
+                                  toIntArray(endOffsets),
+                                  null,
+                                  null,
+                                  null,
+                                  text.length());
+      } else {
+        // terms only
+        assertTokenStreamContents(ts, 
+                                  tokens.toArray(new String[tokens.size()]));
       }
     }
   }

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Sat Mar 24 20:10:42 2012
@@ -27,10 +27,7 @@ import java.io.OutputStream;
 import java.io.PrintStream;
 import java.lang.reflect.Method;
 import java.nio.CharBuffer;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
+import java.util.*;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipFile;
 
@@ -429,12 +426,51 @@ public class _TestUtil {
         case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
         case 21: sb.append("\n"); break;
         case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
+        case 23: {
+          sb.append("<");
+          if (0 == nextInt(random, 0, 3)) {
+            sb.append("          ".substring(nextInt(random, 1, 10)));
+          }
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append("/");
+            if (0 == nextInt(random, 0, 3)) {
+              sb.append("          ".substring(nextInt(random, 1, 10)));
+            }
+          }
+          switch (nextInt(random, 0, 3)) {
+            case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
+            case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
+            case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
+            // default: append nothing
+          }
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
         default: sb.append(randomSimpleString(random));
       }
     }
     return sb.toString();
   }
 
+  /**
+   * Randomly upcases, downcases, or leaves intact each code point in the given string.
+   * <p>
+   * Case mapping is applied per code point via {@link Character#toUpperCase(int)} and
+   * {@link Character#toLowerCase(int)}, so the result does not depend on the JVM's
+   * default locale and always contains the same number of code points as the input.
+   *
+   * @param random source of randomness; consulted once per code point
+   * @param str the string to recase
+   * @return a new string in which each code point of {@code str} was independently recased
+   */
+  public static String randomlyRecaseCodePoints(Random random, String str) {
+    StringBuilder builder = new StringBuilder(str.length());
+    int pos = 0;
+    while (pos < str.length()) {
+      int codePoint = str.codePointAt(pos);
+      pos += Character.charCount(codePoint); // step over both halves of a surrogate pair
+      switch (nextInt(random, 0, 2)) {
+        case 0: builder.appendCodePoint(Character.toUpperCase(codePoint)); break;
+        case 1: builder.appendCodePoint(Character.toLowerCase(codePoint)); break;
+        case 2: builder.appendCodePoint(codePoint); // leave intact
+      }
+    }
+    return builder.toString();
+  }
+
   private static final int[] blockStarts = {
     0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 
     0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,