You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/03/24 21:10:42 UTC
svn commit: r1304904 - in /lucene/dev/branches/branch_3x/lucene:
contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/
contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/
test-framework/src/java/org/apache/lucene...
Author: sarowe
Date: Sat Mar 24 20:10:42 2012
New Revision: 1304904
URL: http://svn.apache.org/viewvc?rev=1304904&view=rev
Log:
LUCENE-3913: Fix HTMLStripCharFilter invalid final offset for input containing </br>
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java Sat Mar 24 20:10:42 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/24/12 10:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 12:34 PM */
package org.apache.lucene.analysis.charfilter;
@@ -40,7 +40,7 @@ import org.apache.lucene.analysis.util.O
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 1/24/12 10:07 AM from the specification file
+ * on 3/24/12 12:34 PM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@@ -30978,7 +30978,9 @@ public final class HTMLStripCharFilter e
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+ // add (length of input that won't be output) [ - (substitution length) = 0 ]
cumulativeDiff += yychar - inputStart;
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
@@ -30986,7 +30988,9 @@ public final class HTMLStripCharFilter e
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
+ // add (length of input that won't be output) - (substitution length)
cumulativeDiff += inputSegment.length() - outputSegment.length();
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
@@ -31098,17 +31102,10 @@ public final class HTMLStripCharFilter e
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
- case 34:
- { cumulativeDiff += yychar - inputStart + yylength();
- addOffCorrectMap(outputCharCount, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
- }
- case 54: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
- case 55: break;
+ case 54: break;
case 18:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@@ -31118,11 +31115,11 @@ public final class HTMLStripCharFilter e
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
- case 56: break;
+ case 55: break;
case 35:
{ yybegin(SCRIPT);
}
- case 57: break;
+ case 56: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
@@ -31132,24 +31129,17 @@ public final class HTMLStripCharFilter e
return outputSegment.nextChar();
}
}
- case 58: break;
+ case 57: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
- case 59: break;
+ case 58: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
- case 60: break;
- case 47:
- { cumulativeDiff += inputSegment.length() + yylength();
- addOffCorrectMap(outputCharCount, cumulativeDiff);
- inputSegment.clear();
- yybegin(CDATA);
- }
- case 61: break;
+ case 59: break;
case 31:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31184,7 +31174,43 @@ public final class HTMLStripCharFilter e
return outputSegment.nextChar();
}
}
- case 62: break;
+ case 60: break;
+ case 53:
+ { // Handle paired UTF-16 surrogates.
+ String surrogatePair = yytext();
+ char highSurrogate = '\u0000';
+ try { // High surrogates are in decimal range [55296, 56319]
+ highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing high surrogate '"
+ + surrogatePair.substring(1, 6) + "'";
+ }
+ if (Character.isHighSurrogate(highSurrogate)) {
+ char lowSurrogate = '\u0000';
+ try { // Low surrogates are in decimal range [56320, 57343]
+ lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing low surrogate '"
+ + surrogatePair.substring(9, 14) + "'";
+ }
+ if (Character.isLowSurrogate(lowSurrogate)) {
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ outputSegment.unsafeWrite(lowSurrogate);
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return highSurrogate;
+ }
+ }
+ yypushback(surrogatePair.length() - 1); // Consume only '#'
+ inputSegment.append('#');
+ yybegin(NUMERIC_CHARACTER);
+ }
+ case 61: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
@@ -31194,15 +31220,7 @@ public final class HTMLStripCharFilter e
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
- case 63: break;
- case 7:
- { cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
- addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
- yybegin(YYINITIAL);
- return outputSegment.nextChar();
- }
- case 64: break;
+ case 62: break;
case 6:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31236,55 +31254,18 @@ public final class HTMLStripCharFilter e
return outputSegment.nextChar();
}
}
- case 65: break;
+ case 63: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
- case 66: break;
- case 52:
- { // Handle paired UTF-16 surrogates.
- String surrogatePair = yytext();
- char highSurrogate = '\u0000';
- try { // High surrogates are in decimal range [55296, 56319]
- highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
- } catch(Exception e) { // should never happen
- assert false: "Exception parsing high surrogate '"
- + surrogatePair.substring(1, 6) + "'";
- }
- if (Character.isHighSurrogate(highSurrogate)) {
- outputSegment = entitySegment;
- outputSegment.clear();
- try {
- outputSegment.unsafeWrite
- ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
- } catch(Exception e) { // should never happen
- assert false: "Exception parsing low surrogate '"
- + surrogatePair.substring(10, 14) + "'";
- }
- cumulativeDiff += inputSegment.length() + yylength() - 2;
- addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
- return highSurrogate;
- }
- yypushback(surrogatePair.length() - 1); // Consume only '#'
- inputSegment.append('#');
- yybegin(NUMERIC_CHARACTER);
- }
- case 67: break;
+ case 64: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
- case 68: break;
- case 37:
- { cumulativeDiff += yylength();
- addOffCorrectMap(outputCharCount, cumulativeDiff);
- yybegin(YYINITIAL);
- }
- case 69: break;
+ case 65: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@@ -31294,11 +31275,21 @@ public final class HTMLStripCharFilter e
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
- case 70: break;
+ case 66: break;
+ case 27:
+ { // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+ }
+ case 67: break;
case 38:
{ yybegin(restoreState);
}
- case 71: break;
+ case 68: break;
case 19:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@@ -31308,10 +31299,45 @@ public final class HTMLStripCharFilter e
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
- case 72: break;
+ case 69: break;
+ case 26:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+ cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ }
+ case 70: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
+ case 71: break;
+ case 36:
+ { yybegin(YYINITIAL);
+ if (escapeBR) {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ return outputSegment.nextChar();
+ } else {
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+ inputSegment.reset();
+ return BR_END_TAG_REPLACEMENT;
+ }
+ }
+ case 72: break;
+ case 47:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+ cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(CDATA);
+ }
case 73: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
@@ -31321,11 +31347,11 @@ public final class HTMLStripCharFilter e
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 75: break;
- case 26:
- { cumulativeDiff += inputSegment.length() + yylength();
+ case 37:
+ { // add (this match length) [ - (substitution length) = 0 ]
+ cumulativeDiff += yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
- inputSegment.clear();
- outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 76: break;
@@ -31351,71 +31377,59 @@ public final class HTMLStripCharFilter e
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 80: break;
+ case 24:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+ case 81: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
+ // add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
- int outputEnd = outputCharCount;
+ // position at (already output length) -- substitution handled below
+ int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
+ // add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
- ++outputEnd;
+ // add (substitution length)
+ ++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
- addOffCorrectMap(outputEnd, cumulativeDiff);
+ addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
- case 81: break;
- case 27:
- { cumulativeDiff += inputSegment.length() + yylength() - 1;
- addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
- return BLOCK_LEVEL_START_TAG_REPLACEMENT;
- }
case 82: break;
- case 24:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
- outputSegment = inputSegment;
- yybegin(YYINITIAL);
- return outputSegment.nextChar();
- }
- case 83: break;
case 2:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
+ case 83: break;
+ case 14:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
+ cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ }
case 84: break;
- case 50:
- { // Handle paired UTF-16 surrogates.
- outputSegment = entitySegment;
- outputSegment.clear();
- String surrogatePair = yytext();
- char highSurrogate = '\u0000';
- try {
- highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
- } catch(Exception e) { // should never happen
- assert false: "Exception parsing high surrogate '"
- + surrogatePair.substring(2, 6) + "'";
- }
- try {
- outputSegment.unsafeWrite
- ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
- } catch(Exception e) { // should never happen
- assert false: "Exception parsing low surrogate '"
- + surrogatePair.substring(10, 14) + "'";
- }
- cumulativeDiff += inputSegment.length() + yylength() - 2;
- addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ case 34:
+ { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
+ cumulativeDiff += yychar - inputStart + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0]
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
- return highSurrogate;
}
case 85: break;
case 32:
@@ -31430,6 +31444,15 @@ public final class HTMLStripCharFilter e
{ yybegin(STYLE);
}
case 88: break;
+ case 7:
+ { // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+ case 89: break;
case 4:
{ yypushback(1);
outputSegment = inputSegment;
@@ -31437,14 +31460,6 @@ public final class HTMLStripCharFilter e
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
- case 89: break;
- case 25:
- { cumulativeDiff += inputSegment.length() + yylength() - 1;
- addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
- return BLOCK_LEVEL_END_TAG_REPLACEMENT;
- }
case 90: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
@@ -31462,6 +31477,34 @@ public final class HTMLStripCharFilter e
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 94: break;
+ case 50:
+ { // Handle paired UTF-16 surrogates.
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ String surrogatePair = yytext();
+ char highSurrogate = '\u0000';
+ try {
+ highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing high surrogate '"
+ + surrogatePair.substring(2, 6) + "'";
+ }
+ try {
+ outputSegment.unsafeWrite
+ ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing low surrogate '"
+ + surrogatePair.substring(10, 14) + "'";
+ }
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return highSurrogate;
+ }
+ case 95: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
@@ -31483,7 +31526,9 @@ public final class HTMLStripCharFilter e
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -31493,12 +31538,26 @@ public final class HTMLStripCharFilter e
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
- case 95: break;
+ case 96: break;
+ case 25:
+ { // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+ }
+ case 97: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
- case 96: break;
+ case 98: break;
+ case 44:
+ { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
+ }
+ case 99: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
@@ -31506,23 +31565,26 @@ public final class HTMLStripCharFilter e
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
- cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
- case 97: break;
- case 44:
- { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
- }
- case 98: break;
+ case 100: break;
case 17:
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
- case 99: break;
- case 53:
+ case 101: break;
+ case 21:
+ { previousRestoreState = restoreState;
+ restoreState = SERVER_SIDE_INCLUDE;
+ yybegin(SINGLE_QUOTED_STRING);
+ }
+ case 102: break;
+ case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
@@ -31533,89 +31595,64 @@ public final class HTMLStripCharFilter e
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
- char lowSurrogate = '\u0000';
- try { // Low surrogates are in decimal range [56320, 57343]
- lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ try {
+ outputSegment.unsafeWrite
+ ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
- + surrogatePair.substring(9, 14) + "'";
- }
- if (Character.isLowSurrogate(lowSurrogate)) {
- outputSegment = entitySegment;
- outputSegment.clear();
- outputSegment.unsafeWrite(lowSurrogate);
- cumulativeDiff += inputSegment.length() + yylength() - 2;
- addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
- return highSurrogate;
+ + surrogatePair.substring(10, 14) + "'";
}
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
- case 100: break;
- case 14:
- { cumulativeDiff += inputSegment.length() + yylength();
- addOffCorrectMap(outputCharCount, cumulativeDiff);
- inputSegment.clear();
- yybegin(YYINITIAL);
+ case 103: break;
+ case 9:
+ { inputSegment.write(zzBuffer, zzStartRead, yylength());
+ if (null != escapedTags
+ && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+ yybegin(START_TAG_TAIL_INCLUDE);
+ } else {
+ yybegin(START_TAG_TAIL_EXCLUDE);
+ }
}
- case 101: break;
- case 21:
- { previousRestoreState = restoreState;
- restoreState = SERVER_SIDE_INCLUDE;
- yybegin(SINGLE_QUOTED_STRING);
+ case 104: break;
+ case 15:
+ {
}
- case 102: break;
+ case 105: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
+ // add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
- int outputEnd = outputCharCount;
+ // position the offset correction at (already output length) -- substitution handled below
+ int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
+ // add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
- ++outputEnd;
+ // add (substitution length)
+ ++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
- addOffCorrectMap(outputEnd, cumulativeDiff);
+ addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
- case 103: break;
- case 9:
- { inputSegment.write(zzBuffer, zzStartRead, yylength());
- if (null != escapedTags
- && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
- yybegin(START_TAG_TAIL_INCLUDE);
- } else {
- yybegin(START_TAG_TAIL_EXCLUDE);
- }
- }
- case 104: break;
- case 36:
- { yybegin(YYINITIAL);
- if (escapeBR) {
- inputSegment.write(zzBuffer, zzStartRead, yylength());
- outputSegment = inputSegment;
- return outputSegment.nextChar();
- } else {
- cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
- addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
- inputSegment.reset();
- return BR_END_TAG_REPLACEMENT;
- }
- }
- case 105: break;
- case 15:
- {
- }
case 106: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Sat Mar 24 20:10:42 2012
@@ -294,7 +294,9 @@ InlineElment = ( [aAbBiIqQsSuU]
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+ // add (length of input that won't be output) [ - (substitution length) = 0 ]
cumulativeDiff += yychar - inputStart;
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
@@ -302,7 +304,9 @@ InlineElment = ( [aAbBiIqQsSuU]
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
+ // add (length of input that won't be output) - (substitution length)
cumulativeDiff += inputSegment.length() - outputSegment.length();
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
@@ -375,7 +379,9 @@ InlineElment = ( [aAbBiIqQsSuU]
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -404,7 +410,9 @@ InlineElment = ( [aAbBiIqQsSuU]
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -438,7 +446,9 @@ InlineElment = ( [aAbBiIqQsSuU]
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -473,7 +483,9 @@ InlineElment = ( [aAbBiIqQsSuU]
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -558,8 +570,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<CHARACTER_REFERENCE_TAIL> {
";" {
- cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
@@ -575,9 +588,10 @@ InlineElment = ( [aAbBiIqQsSuU]
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
- cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
- addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
@@ -613,7 +627,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<END_TAG_TAIL_EXCLUDE> {
\s* ">" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -622,7 +638,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<END_TAG_TAIL_SUBSTITUTE> {
\s* ">" {
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -638,7 +656,9 @@ InlineElment = ( [aAbBiIqQsSuU]
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
"?" [^>]* [/?] ">" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -650,8 +670,9 @@ InlineElment = ( [aAbBiIqQsSuU]
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
- cumulativeDiff
- += inputSegment.length() + yylength() - outputSegment.length();
+ // add (previously matched input length) + (this match length) - (substitution length)
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
@@ -709,7 +730,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<START_TAG_TAIL_EXCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
@@ -719,7 +742,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<START_TAG_TAIL_SUBSTITUTE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
+ // position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -730,7 +755,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<BANG> {
"--" { yybegin(COMMENT); }
">" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -743,7 +770,9 @@ InlineElment = ( [aAbBiIqQsSuU]
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
@@ -755,7 +784,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<CDATA> {
"]]>" {
+ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
@@ -765,7 +796,9 @@ InlineElment = ( [aAbBiIqQsSuU]
<COMMENT> {
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"-->" {
+ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
+ // position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
@@ -821,19 +854,23 @@ InlineElment = ( [aAbBiIqQsSuU]
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
+ // add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
- int outputEnd = outputCharCount;
+ // position at (already output length) -- substitution handled below
+ int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
+ // add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
- ++outputEnd;
+ // add (substitution length)
+ ++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
- addOffCorrectMap(outputEnd, cumulativeDiff);
+ addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }
@@ -844,19 +881,23 @@ InlineElment = ( [aAbBiIqQsSuU]
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
+ // add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
- int outputEnd = outputCharCount;
+ // position the offset correction at (already output length) -- substitution handled below
+ int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
+ // add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
- ++outputEnd;
+ // add (substitution length)
+ ++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
- addOffCorrectMap(outputEnd, cumulativeDiff);
+ addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Sat Mar 24 20:10:42 2012
@@ -32,12 +32,26 @@ import org.apache.lucene.analysis.BaseTo
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ static private Analyzer newTestAnalyzer() {
+ return new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new HTMLStripCharFilter(CharReader.get(reader));
+ }
+ };
+ }
+
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
//
public void test() throws IOException {
@@ -495,41 +509,17 @@ public class HTMLStripCharFilterTest ext
}
public void testRandom() throws Exception {
- Analyzer analyzer = new ReusableAnalyzerBase() {
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
-
- @Override
- protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(reader));
- }
- };
-
int numRounds = RANDOM_MULTIPLIER * 10000;
- checkRandomData(random, analyzer, numRounds);
+ checkRandomData(random, newTestAnalyzer(), numRounds);
}
public void testRandomHugeStrings() throws Exception {
- Analyzer analyzer = new ReusableAnalyzerBase() {
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
-
- @Override
- protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(reader));
- }
- };
-
int numRounds = RANDOM_MULTIPLIER * 200;
- checkRandomData(random, analyzer, numRounds, 8192);
+ checkRandomData(random, newTestAnalyzer(), numRounds, 8192);
+ }
+
+ public void testCloseBR() throws Exception {
+ checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), " Secretary)</br> [[M");
}
public void testServerSideIncludes() throws Exception {
@@ -799,9 +789,7 @@ public class HTMLStripCharFilterTest ext
public void testRandomBrokenHTML() throws Exception {
int maxNumElements = 10000;
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
- Reader reader = new HTMLStripCharFilter
- (CharReader.get(new StringReader(text)));
- while (reader.read() != -1);
+ checkAnalysisConsistency(random, newTestAnalyzer(), random.nextBoolean(), text);
}
public void testRandomText() throws Exception {
@@ -840,18 +828,7 @@ public class HTMLStripCharFilterTest ext
}
public void testUTF16Surrogates() throws Exception {
- Analyzer analyzer = new ReusableAnalyzerBase() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
-
- @Override
- protected Reader initReader(Reader reader) {
- return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
- }
- };
+ Analyzer analyzer = newTestAnalyzer();
// Paired surrogates
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
new String[] { "one", "two", "\uD86C\uDC01three" } );
Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 24 20:10:42 2012
@@ -391,188 +391,193 @@ public abstract class BaseTokenStreamTes
}
- if (VERBOSE) {
- System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
- }
+ checkAnalysisConsistency(random, a, useCharFilter, text);
+ }
+ }
- int remainder = random.nextInt(10);
- Reader reader = new StringReader(text);
- TokenStream ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
- assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
- CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
- OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
- PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
- PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
- TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
- List<String> tokens = new ArrayList<String>();
- List<String> types = new ArrayList<String>();
- List<Integer> positions = new ArrayList<Integer>();
- List<Integer> positionLengths = new ArrayList<Integer>();
- List<Integer> startOffsets = new ArrayList<Integer>();
- List<Integer> endOffsets = new ArrayList<Integer>();
- ts.reset();
-
- // First pass: save away "correct" tokens
- while (ts.incrementToken()) {
- tokens.add(termAtt.toString());
- if (typeAtt != null) types.add(typeAtt.type());
- if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
- if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
- if (offsetAtt != null) {
- startOffsets.add(offsetAtt.startOffset());
- endOffsets.add(offsetAtt.endOffset());
- }
- }
- ts.end();
- ts.close();
- // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
- if (!tokens.isEmpty()) {
-
- // KWTokenizer (for example) can produce a token
- // even when input is length 0:
- if (text.length() != 0) {
-
- // (Optional) second pass: do something evil:
- final int evilness = random.nextInt(50);
- if (evilness == 17) {
- if (VERBOSE) {
- System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
- }
- // Throw an errant exception from the Reader:
+ public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
- MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
- evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
- reader = evilReader;
-
- try {
- // NOTE: some Tokenizers go and read characters
- // when you call .setReader(Reader), eg
- // PatternTokenizer. This is a bit
- // iffy... (really, they should only
- // pull from the Reader when you call
- // .incremenToken(), I think?), but we
- // currently allow it, so, we must call
- // a.tokenStream inside the try since we may
- // hit the exc on init:
- ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
- ts.reset();
- while (ts.incrementToken());
- fail("did not hit exception");
- } catch (RuntimeException re) {
- assertTrue(MockReaderWrapper.isMyEvilException(re));
- }
- try {
- ts.end();
- } catch (AssertionError ae) {
- // Catch & ignore MockTokenizer's
- // anger...
- if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
- // OK
- } else {
- throw ae;
- }
- }
- ts.close();
- } else if (evilness == 7) {
- // Only consume a subset of the tokens:
- final int numTokensToRead = random.nextInt(tokens.size());
- if (VERBOSE) {
- System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
- }
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+ }
+
+ int remainder = random.nextInt(10);
+ Reader reader = new StringReader(text);
+ TokenStream ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
+ CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+ OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
+ PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+ PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
+ TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
+ List<String> tokens = new ArrayList<String>();
+ List<String> types = new ArrayList<String>();
+ List<Integer> positions = new ArrayList<Integer>();
+ List<Integer> positionLengths = new ArrayList<Integer>();
+ List<Integer> startOffsets = new ArrayList<Integer>();
+ List<Integer> endOffsets = new ArrayList<Integer>();
+ ts.reset();
+
+ // First pass: save away "correct" tokens
+ while (ts.incrementToken()) {
+ tokens.add(termAtt.toString());
+ if (typeAtt != null) types.add(typeAtt.type());
+ if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+ if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
+ if (offsetAtt != null) {
+ startOffsets.add(offsetAtt.startOffset());
+ endOffsets.add(offsetAtt.endOffset());
+ }
+ }
+ ts.end();
+ ts.close();
+ // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
+ if (!tokens.isEmpty()) {
+
+ // KWTokenizer (for example) can produce a token
+ // even when input is length 0:
+ if (text.length() != 0) {
+
+ // (Optional) second pass: do something evil:
+ final int evilness = random.nextInt(50);
+ if (evilness == 17) {
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis w/ exception");
+ }
+ // Throw an errant exception from the Reader:
- reader = new StringReader(text);
- ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ MockReaderWrapper evilReader = new MockReaderWrapper(random, new StringReader(text));
+ evilReader.throwExcAfterChar(random.nextInt(text.length()+1));
+ reader = evilReader;
+
+ try {
+ // NOTE: some Tokenizers go and read characters
+ // when you call .setReader(Reader), eg
+ // PatternTokenizer. This is a bit
+ // iffy... (really, they should only
+ // pull from the Reader when you call
+ // .incremenToken(), I think?), but we
+ // currently allow it, so, we must call
+ // a.tokenStream inside the try since we may
+ // hit the exc on init:
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(evilReader, remainder) : evilReader);
ts.reset();
- for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
- assertTrue(ts.incrementToken());
+ while (ts.incrementToken());
+ fail("did not hit exception");
+ } catch (RuntimeException re) {
+ assertTrue(MockReaderWrapper.isMyEvilException(re));
+ }
+ try {
+ ts.end();
+ } catch (AssertionError ae) {
+ // Catch & ignore MockTokenizer's
+ // anger...
+ if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+ // OK
+ } else {
+ throw ae;
}
- try {
- ts.end();
- } catch (AssertionError ae) {
- // Catch & ignore MockTokenizer's
- // anger...
- if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
- // OK
- } else {
- throw ae;
- }
+ }
+ ts.close();
+ } else if (evilness == 7) {
+ // Only consume a subset of the tokens:
+ final int numTokensToRead = random.nextInt(tokens.size());
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.size() + " tokens");
+ }
+
+ reader = new StringReader(text);
+ ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ ts.reset();
+ for(int tokenCount=0;tokenCount<numTokensToRead;tokenCount++) {
+ assertTrue(ts.incrementToken());
+ }
+ try {
+ ts.end();
+ } catch (AssertionError ae) {
+ // Catch & ignore MockTokenizer's
+ // anger...
+ if ("end() called before incrementToken() returned false!".equals(ae.getMessage())) {
+ // OK
+ } else {
+ throw ae;
}
- ts.close();
}
+ ts.close();
}
+ }
- // Final pass: verify clean tokenization matches
- // results from first pass:
+ // Final pass: verify clean tokenization matches
+ // results from first pass:
+
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
+ }
+ reader = new StringReader(text);
+ if (random.nextInt(30) == 7) {
if (VERBOSE) {
- System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
- reader = new StringReader(text);
- if (random.nextInt(30) == 7) {
- if (VERBOSE) {
- System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
- }
-
- reader = new MockReaderWrapper(random, reader);
- }
+ reader = new MockReaderWrapper(random, reader);
+ }
- ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
- if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
- // offset + pos + posLength + type
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]),
- toIntArray(startOffsets),
- toIntArray(endOffsets),
- types.toArray(new String[types.size()]),
- toIntArray(positions),
- toIntArray(positionLengths),
- text.length());
- } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
- // offset + pos + type
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]),
- toIntArray(startOffsets),
- toIntArray(endOffsets),
- types.toArray(new String[types.size()]),
- toIntArray(positions),
- null,
- text.length());
- } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
- // offset + pos + posLength
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]),
- toIntArray(startOffsets),
- toIntArray(endOffsets),
- null,
- toIntArray(positions),
- toIntArray(positionLengths),
- text.length());
- } else if (posIncAtt != null && offsetAtt != null) {
- // offset + pos
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]),
- toIntArray(startOffsets),
- toIntArray(endOffsets),
- null,
- toIntArray(positions),
- null,
- text.length());
- } else if (offsetAtt != null) {
- // offset
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]),
- toIntArray(startOffsets),
- toIntArray(endOffsets),
- null,
- null,
- null,
- text.length());
- } else {
- // terms only
- assertTokenStreamContents(ts,
- tokens.toArray(new String[tokens.size()]));
- }
+ ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+ // offset + pos + posLength + type
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ types.toArray(new String[types.size()]),
+ toIntArray(positions),
+ toIntArray(positionLengths),
+ text.length());
+ } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+ // offset + pos + type
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ types.toArray(new String[types.size()]),
+ toIntArray(positions),
+ null,
+ text.length());
+ } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+ // offset + pos + posLength
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ null,
+ toIntArray(positions),
+ toIntArray(positionLengths),
+ text.length());
+ } else if (posIncAtt != null && offsetAtt != null) {
+ // offset + pos
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ null,
+ toIntArray(positions),
+ null,
+ text.length());
+ } else if (offsetAtt != null) {
+ // offset
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]),
+ toIntArray(startOffsets),
+ toIntArray(endOffsets),
+ null,
+ null,
+ null,
+ text.length());
+ } else {
+ // terms only
+ assertTokenStreamContents(ts,
+ tokens.toArray(new String[tokens.size()]));
}
}
}
Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1304904&r1=1304903&r2=1304904&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Sat Mar 24 20:10:42 2012
@@ -27,10 +27,7 @@ import java.io.OutputStream;
import java.io.PrintStream;
import java.lang.reflect.Method;
import java.nio.CharBuffer;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
+import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@@ -429,12 +426,51 @@ public class _TestUtil {
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
case 21: sb.append("\n"); break;
case 22: sb.append(" ".substring(nextInt(random, 0, 10))); break;
+ case 23: {
+ sb.append("<");
+ if (0 == nextInt(random, 0, 3)) {
+ sb.append(" ".substring(nextInt(random, 1, 10)));
+ }
+ if (0 == nextInt(random, 0, 1)) {
+ sb.append("/");
+ if (0 == nextInt(random, 0, 3)) {
+ sb.append(" ".substring(nextInt(random, 1, 10)));
+ }
+ }
+ switch (nextInt(random, 0, 3)) {
+ case 0: sb.append(randomlyRecaseCodePoints(random, "script")); break;
+ case 1: sb.append(randomlyRecaseCodePoints(random, "style")); break;
+ case 2: sb.append(randomlyRecaseCodePoints(random, "br")); break;
+ // default: append nothing
+ }
+ sb.append(">".substring(nextInt(random, 0, 1)));
+ break;
+ }
default: sb.append(randomSimpleString(random));
}
}
return sb.toString();
}
+ /**
+ * Randomly upcases, downcases, or leaves intact each code point in the given string
+ */
+ public static String randomlyRecaseCodePoints(Random random, String str) {
+ StringBuilder builder = new StringBuilder();
+ int pos = 0;
+ while (pos < str.length()) {
+ int codePoint = str.codePointAt(pos);
+ pos += Character.charCount(codePoint);
+ String codePointSubstring = new String(new int[] { codePoint }, 0, 1);
+ switch (nextInt(random, 0, 2)) {
+ case 0: builder.append(codePointSubstring.toUpperCase()); break;
+ case 1: builder.append(codePointSubstring.toLowerCase()); break;
+ case 2: builder.append(codePointSubstring); // leave intact
+ }
+ }
+ return builder.toString();
+ }
+
private static final int[] blockStarts = {
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
0x0500, 0x0530, 0x0590, 0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800,