You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2012/02/27 14:13:10 UTC
svn commit: r1294141 - in /lucene/dev/trunk:
lucene/test-framework/src/java/org/apache/lucene/analysis/
lucene/test-framework/src/java/org/apache/lucene/util/ modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/
module...
Author: dweiss
Date: Mon Feb 27 13:13:10 2012
New Revision: 1294141
URL: http://svn.apache.org/viewvc?rev=1294141&view=rev
Log:
LUCENE-3820: Wrong trailing index calculation in PatternReplaceCharFilter.
Modified:
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
lucene/dev/trunk/modules/analysis/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1294141&r1=1294140&r2=1294141&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Mon Feb 27 13:13:10 2012
@@ -246,15 +246,22 @@ public abstract class BaseTokenStreamTes
assertAnalyzesToReuse(a, input, new String[]{expected});
}
- // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
- // TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
+ /** utility method for blasting tokenstreams with data to make sure they don't do anything crazy */
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
- checkRandomData(random, a, iterations, 20);
+ checkRandomData(random, a, iterations, false);
+ }
+
+ /**
+ * utility method for blasting tokenstreams with data to make sure they don't do anything crazy
+ * @param simple true if only ascii strings will be used (try to avoid)
+ */
+ public static void checkRandomData(Random random, Analyzer a, int iterations, boolean simple) throws IOException {
+ checkRandomData(random, a, iterations, 20, simple);
// now test with multiple threads
int numThreads = _TestUtil.nextInt(random, 4, 8);
Thread threads[] = new Thread[numThreads];
for (int i = 0; i < threads.length; i++) {
- threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations);
+ threads[i] = new AnalysisThread(new Random(random.nextLong()), a, iterations, simple);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@@ -272,11 +279,13 @@ public abstract class BaseTokenStreamTes
final int iterations;
final Random random;
final Analyzer a;
+ final boolean simple;
- AnalysisThread(Random random, Analyzer a, int iterations) {
+ AnalysisThread(Random random, Analyzer a, int iterations, boolean simple) {
this.random = random;
this.a = a;
this.iterations = iterations;
+ this.simple = simple;
}
@Override
@@ -284,32 +293,36 @@ public abstract class BaseTokenStreamTes
try {
// see the part in checkRandomData where it replays the same text again
// to verify reproducability/reuse: hopefully this would catch thread hazards.
- checkRandomData(random, a, iterations, 20);
+ checkRandomData(random, a, iterations, 20, simple);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
- public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
- checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple) throws IOException {
+ checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean(), simple);
}
- public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
- switch(_TestUtil.nextInt(random, 0, 4)) {
- case 0:
- text = _TestUtil.randomSimpleString(random);
- break;
- case 1:
- text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
- break;
- case 2:
- text = _TestUtil.randomHtmlishString(random, maxWordLength);
- break;
- default:
- text = _TestUtil.randomUnicodeString(random, maxWordLength);
+ if (simple) {
+ text = random.nextBoolean() ? _TestUtil.randomSimpleString(random) : _TestUtil.randomHtmlishString(random, maxWordLength);
+ } else {
+ switch(_TestUtil.nextInt(random, 0, 4)) {
+ case 0:
+ text = _TestUtil.randomSimpleString(random);
+ break;
+ case 1:
+ text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
+ break;
+ case 2:
+ text = _TestUtil.randomHtmlishString(random, maxWordLength);
+ break;
+ default:
+ text = _TestUtil.randomUnicodeString(random, maxWordLength);
+ }
}
if (VERBOSE) {
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1294141&r1=1294140&r2=1294141&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Mon Feb 27 13:13:10 2012
@@ -250,6 +250,36 @@ public class _TestUtil {
}
}
+ /**
+ * Returns a String that's "regexpish" (contains lots of operators typically found in regular expressions)
+ * If you call this enough times, you might get a valid regex!
+ */
+ public static String randomRegexpishString(Random r) {
+ final int end = r.nextInt(20);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ int t = r.nextInt(11);
+ if (t == 0) {
+ buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+ }
+ else if (1 == t) buffer[i] = '.';
+ else if (2 == t) buffer[i] = '?';
+ else if (3 == t) buffer[i] = '*';
+ else if (4 == t) buffer[i] = '+';
+ else if (5 == t) buffer[i] = '(';
+ else if (6 == t) buffer[i] = ')';
+ else if (7 == t) buffer[i] = '-';
+ else if (8 == t) buffer[i] = '[';
+ else if (9 == t) buffer[i] = ']';
+ else if (10 == t) buffer[i] = '|';
+ }
+ return new String(buffer, 0, end);
+ }
+
private static final String[] HTML_CHAR_ENTITIES = {
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
Modified: lucene/dev/trunk/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/CHANGES.txt?rev=1294141&r1=1294140&r2=1294141&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/analysis/CHANGES.txt Mon Feb 27 13:13:10 2012
@@ -7,6 +7,9 @@ http://s.apache.org/luceneversions
API Changes
+ * LUCENE-3820: Deprecated constructors accepting pattern matching bounds. The input
+ is buffered and matched in one pass. (Dawid Weiss)
+
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor
of the pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
@@ -34,6 +37,11 @@ API Changes
and sometimes different depending on the type of set, and ultimately a CharArraySet
or CharArrayMap was always used anyway. (Robert Muir)
+Bug fixes
+
+ * LUCENE-3820: PatternReplaceCharFilter could return invalid token positions.
+ (Dawid Weiss)
+
New Features
* LUCENE-2341: A new analyzer/ filter: Morfologik - a dictionary-driven lemmatizer
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java?rev=1294141&r1=1294140&r2=1294141&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java Mon Feb 27 13:13:10 2012
@@ -18,12 +18,13 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
-import java.util.LinkedList;
+import java.io.Reader;
+import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.lucene.analysis.charfilter.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
/**
* CharFilter that uses a regular expression for the target of replace string.
@@ -48,147 +49,88 @@ import org.apache.lucene.analysis.CharSt
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
+ @Deprecated
+ public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
private final Pattern pattern;
private final String replacement;
- private final int maxBlockChars;
- private final String blockDelimiters;
- public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
+ private Reader transformedInput;
- private LinkedList<Character> buffer;
- private int nextCharCounter;
- private char[] blockBuffer;
- private int blockBufferLength;
- private String replaceBlockBuffer;
- private int replaceBlockBufferOffset;
-
- public PatternReplaceCharFilter( Pattern pattern, String replacement, CharStream in ){
- this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
- }
-
- public PatternReplaceCharFilter( Pattern pattern, String replacement,
- int maxBlockChars, CharStream in ){
- this( pattern, replacement, maxBlockChars, null, in );
- }
-
- public PatternReplaceCharFilter( Pattern pattern, String replacement,
- String blockDelimiters, CharStream in ){
- this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
- }
-
- public PatternReplaceCharFilter( Pattern pattern, String replacement,
- int maxBlockChars, String blockDelimiters, CharStream in ){
- super( in );
+ public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
+ super(in);
this.pattern = pattern;
this.replacement = replacement;
- if( maxBlockChars < 1 )
- throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
- this.maxBlockChars = maxBlockChars;
- this.blockDelimiters = blockDelimiters;
- blockBuffer = new char[maxBlockChars];
- }
-
- private boolean prepareReplaceBlock() throws IOException {
- while( true ){
- if( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset )
- return true;
- // prepare block buffer
- blockBufferLength = 0;
- while( true ){
- int c = nextChar();
- if( c == -1 ) break;
- blockBuffer[blockBufferLength++] = (char)c;
- // end of block?
- boolean foundDelimiter =
- ( blockDelimiters != null ) &&
- ( blockDelimiters.length() > 0 ) &&
- blockDelimiters.indexOf( c ) >= 0;
- if( foundDelimiter ||
- blockBufferLength >= maxBlockChars ) break;
- }
- // block buffer available?
- if( blockBufferLength == 0 ) return false;
- replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
- replaceBlockBufferOffset = 0;
- }
}
- @Override
- public int read() throws IOException {
- while( prepareReplaceBlock() ){
- return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
- }
- return -1;
+ @Deprecated
+ public PatternReplaceCharFilter(Pattern pattern, String replacement,
+ int maxBlockChars, String blockDelimiter, CharStream in) {
+ this(pattern, replacement, in);
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
- char[] tmp = new char[len];
- int l = input.read(tmp, 0, len);
- if (l != -1) {
- for(int i = 0; i < l; i++)
- pushLastChar(tmp[i]);
- }
- l = 0;
- for(int i = off; i < off + len; i++) {
- int c = read();
- if (c == -1) break;
- cbuf[i] = (char) c;
- l++;
+ // Buffer all input on the first call.
+ if (transformedInput == null) {
+ StringBuilder buffered = new StringBuilder();
+ char [] temp = new char [1024];
+ for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
+ buffered.append(temp, 0, cnt);
+ }
+ transformedInput = new StringReader(processPattern(buffered).toString());
}
- return l == 0 ? -1 : l;
- }
- private int nextChar() throws IOException {
- if (buffer != null && !buffer.isEmpty()) {
- nextCharCounter++;
- return buffer.removeFirst().charValue();
- }
- int c = input.read();
- if( c != -1 )
- nextCharCounter++;
- return c;
+ return transformedInput.read(cbuf, off, len);
}
- private void pushLastChar(int c) {
- if (buffer == null) {
- buffer = new LinkedList<Character>();
- }
- buffer.addLast(new Character((char) c));
- }
-
- String getReplaceBlock( String block ){
- char[] blockChars = block.toCharArray();
- return getReplaceBlock( blockChars, 0, blockChars.length );
+ @Override
+ protected int correct(int currentOff) {
+ return Math.max(0, super.correct(currentOff));
}
-
- String getReplaceBlock( char block[], int offset, int length ){
- StringBuffer replaceBlock = new StringBuffer();
- String sourceBlock = new String( block, offset, length );
- Matcher m = pattern.matcher( sourceBlock );
- int lastMatchOffset = 0, lastDiff = 0;
- while( m.find() ){
- m.appendReplacement( replaceBlock, replacement );
- // record cumulative diff for the offset correction
- int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
- if (diff != 0) {
- int prevCumulativeDiff = getLastCumulativeDiff();
- if (diff > 0) {
- for(int i = 0; i < diff; i++){
- addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
- prevCumulativeDiff - 1 - i);
- }
+
+ /**
+ * Replace pattern in input and mark correction offsets.
+ */
+ CharSequence processPattern(CharSequence input) {
+ final Matcher m = pattern.matcher(input);
+
+ final StringBuffer cumulativeOutput = new StringBuffer();
+ int cumulative = 0;
+ int lastMatchEnd = 0;
+ while (m.find()) {
+ final int groupSize = m.end() - m.start();
+ final int skippedSize = m.start() - lastMatchEnd;
+ lastMatchEnd = m.end();
+
+ final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize;
+ m.appendReplacement(cumulativeOutput, replacement);
+ // Matcher doesn't tell us how many characters have been appended before the replacement.
+ // So we need to calculate it. Skipped characters have been added as part of appendReplacement.
+ final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement;
+
+ if (groupSize != replacementSize) {
+ if (replacementSize < groupSize) {
+ // The replacement is smaller.
+ // Add the 'backskip' to the next index after the replacement (this is possibly
+ // after the end of string, but it's fine -- it just means the last character
+ // of the replaced block doesn't reach the end of the original string).
+ cumulative += groupSize - replacementSize;
+ int atIndex = lengthBeforeReplacement + replacementSize;
+ // System.err.println(atIndex + "!" + cumulative);
+ addOffCorrectMap(atIndex, cumulative);
} else {
- addOffCorrectMap(nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
- prevCumulativeDiff - diff);
+ // The replacement is larger. Every new index needs to point to the last
+ // element of the original group (if any).
+ for (int i = groupSize; i < replacementSize; i++) {
+ addOffCorrectMap(lengthBeforeReplacement + i, --cumulative);
+ // System.err.println((lengthBeforeReplacement + i) + " " + cumulative);
+ }
}
}
- // save last offsets
- lastMatchOffset = m.end( 0 );
- lastDiff = diff;
}
- // copy remaining of the part of source block
- m.appendTail( replaceBlock );
- return replaceBlock.toString();
+
+ // Append the remaining output, no further changes to indices.
+ m.appendTail(cumulativeOutput);
+ return cumulativeOutput;
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java?rev=1294141&r1=1294140&r2=1294141&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java Mon Feb 27 13:13:10 2012
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -29,12 +30,107 @@ import org.apache.lucene.analysis.CharSt
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util._TestUtil;
/**
* Tests {@link PatternReplaceCharFilter}
*/
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
-
+ public void testFailingDot() throws IOException {
+ checkOutput(
+ "A. .B.", "\\.[\\s]*", ".",
+ "A..B.",
+ "A..B.");
+ }
+
+ public void testLongerReplacement() throws IOException {
+ checkOutput(
+ "XXabcZZabcYY", "abc", "abcde",
+ "XXabcdeZZabcdeYY",
+ "XXabcccZZabcccYY");
+ checkOutput(
+ "XXabcabcYY", "abc", "abcde",
+ "XXabcdeabcdeYY",
+ "XXabcccabcccYY");
+ checkOutput(
+ "abcabcYY", "abc", "abcde",
+ "abcdeabcdeYY",
+ "abcccabcccYY");
+ checkOutput(
+ "YY", "^", "abcde",
+ "abcdeYY",
+ // Should be: "-----YY" but we're enforcing non-negative offsets.
+ "YYYYYYY");
+ checkOutput(
+ "YY", "$", "abcde",
+ "YYabcde",
+ "YYYYYYY");
+ checkOutput(
+ "XYZ", ".", "abc",
+ "abcabcabc",
+ "XXXYYYZZZ");
+ checkOutput(
+ "XYZ", ".", "$0abc",
+ "XabcYabcZabc",
+ "XXXXYYYYZZZZ");
+ }
+
+ public void testShorterReplacement() throws IOException {
+ checkOutput(
+ "XXabcZZabcYY", "abc", "xy",
+ "XXxyZZxyYY",
+ "XXabZZabYY");
+ checkOutput(
+ "XXabcabcYY", "abc", "xy",
+ "XXxyxyYY",
+ "XXababYY");
+ checkOutput(
+ "abcabcYY", "abc", "xy",
+ "xyxyYY",
+ "ababYY");
+ checkOutput(
+ "abcabcYY", "abc", "",
+ "YY",
+ "YY");
+ checkOutput(
+ "YYabcabc", "abc", "",
+ "YY",
+ "YY");
+ }
+
+ private void checkOutput(String input, String pattern, String replacement,
+ String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
+ CharStream cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
+ CharReader.get(new StringReader(input)));
+
+ StringBuilder output = new StringBuilder();
+ for (int chr = cs.read(); chr > 0; chr = cs.read()) {
+ output.append((char) chr);
+ }
+
+ StringBuilder indexMatched = new StringBuilder();
+ for (int i = 0; i < output.length(); i++) {
+ indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
+ }
+
+ boolean outputGood = expectedOutput.equals(output.toString());
+ boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());
+
+ if (!outputGood || !indexMatchedGood || false) {
+ System.out.println("Pattern : " + pattern);
+ System.out.println("Replac. : " + replacement);
+ System.out.println("Input : " + input);
+ System.out.println("Output : " + output);
+ System.out.println("Expected: " + expectedOutput);
+ System.out.println("Output/i: " + indexMatched);
+ System.out.println("Expected: " + expectedIndexMatchedOutput);
+ System.out.println();
+ }
+
+ assertTrue("Output doesn't match.", outputGood);
+ assertTrue("Index-matched output doesn't match.", indexMatchedGood);
+ }
+
// 1111
// 01234567890123
// this is test.
@@ -142,9 +238,13 @@ public class TestPatternReplaceCharFilte
// 012345678901234567890123456789012345678
// aa bb cc --- aa bb aa. bb aa bb cc
// aa##bb cc --- aa##bb aa. bb aa##bb cc
+
+ // aa bb cc --- aa bbbaa. bb aa b cc
+
public void test2blocksMultiMatches() throws IOException {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
- CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
+
+ CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
@@ -160,10 +260,10 @@ public class TestPatternReplaceCharFilte
// aa b - c . --- b aa . c c b
public void testChain() throws IOException {
final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
- CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".",
+ CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
- cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
- cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
+ cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs );
+ cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
@@ -178,18 +278,33 @@ public class TestPatternReplaceCharFilte
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, tokenizer);
- }
+ int numPatterns = atLeast(100);
+ for (int i = 0; i < numPatterns; i++) {
+ final Pattern p = randomPattern();
+ final String replacement = _TestUtil.randomSimpleString(random);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
- @Override
- protected Reader initReader(Reader reader) {
- return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new PatternReplaceCharFilter(p, replacement, CharReader.get(reader));
+ }
+ };
+ checkRandomData(random, a, 1000*RANDOM_MULTIPLIER, true); // only ascii
+ }
+ }
+
+ public static Pattern randomPattern() {
+ while (true) {
+ try {
+ return Pattern.compile(_TestUtil.randomRegexpishString(random));
+ } catch (PatternSyntaxException ignored) {
+ // if at first you don't succeed...
}
- };
- checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ }
}
-}
+ }