You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/12/16 05:37:48 UTC
svn commit: r1215038 - in /lucene/dev/trunk:
lucene/src/test-framework/java/org/apache/lucene/analysis/
lucene/src/test-framework/java/org/apache/lucene/util/
modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/
modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/
Author: rmuir
Date: Fri Dec 16 04:37:47 2011
New Revision: 1215038
URL: http://svn.apache.org/viewvc?rev=1215038&view=rev
Log:
LUCENE-2208: improve charfilter offset testing
Modified:
lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
Modified: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Dec 16 04:37:47 2011
@@ -246,7 +246,7 @@ public abstract class BaseTokenStreamTes
}
// simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
-
+ // TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
checkRandomData(random, a, iterations, 20);
}
@@ -254,13 +254,16 @@ public abstract class BaseTokenStreamTes
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
- switch(_TestUtil.nextInt(random, 0, 3)) {
+ switch(_TestUtil.nextInt(random, 0, 4)) {
case 0:
text = _TestUtil.randomSimpleString(random);
break;
case 1:
text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
break;
+ case 2:
+ text = _TestUtil.randomHtmlishString(random, maxWordLength);
+ break;
default:
text = _TestUtil.randomUnicodeString(random, maxWordLength);
}
Modified: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Fri Dec 16 04:37:47 2011
@@ -243,6 +243,31 @@ public class _TestUtil {
}
}
}
+
+ // TODO: make this more evil
+ public static String randomHtmlishString(Random random, int numElements) {
+ final int end = random.nextInt(numElements);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < end; i++) {
+ int val = random.nextInt(10);
+ switch(val) {
+ case 0: sb.append("<p>"); break;
+ case 1: sb.append("</p>"); break;
+ case 2: sb.append("<!--"); break;
+ case 3: sb.append("-->"); break;
+ case 4: sb.append("&#"); break;
+ case 5: sb.append(";"); break;
+ case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
+ default:
+ sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+ }
+ }
+ return sb.toString();
+ }
private static final int[] blockStarts = {
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400,
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Fri Dec 16 04:37:47 2011
@@ -26,21 +26,15 @@ import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.junit.Ignore;
-import org.apache.lucene.util.LuceneTestCase;
+public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
-public class HTMLStripCharFilterTest extends LuceneTestCase {
-
- @Override
- public void setUp() throws Exception {
- super.setUp();
- }
-
- @Override
- public void tearDown() throws Exception {
- super.tearDown();
- }
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
//
public void test() throws IOException {
@@ -260,5 +254,24 @@ public class HTMLStripCharFilterTest ext
// test backtracking
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
}
+
+ @Ignore("broken offsets: see LUCENE-2208")
+ public void testRandom() throws Exception {
+ Analyzer analyzer = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+ }
+ };
+
+ int numRounds = RANDOM_MULTIPLIER * 10000;
+ checkRandomData(random, analyzer, numRounds);
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java Fri Dec 16 04:37:47 2011
@@ -17,13 +17,16 @@
package org.apache.lucene.analysis.charfilter;
+import java.io.Reader;
import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -65,55 +68,55 @@ public class TestMappingCharFilter exten
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
+ assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
+ assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
+ assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
+ assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
- assertTokenStreamContents(ts, new String[0]);
+ assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
//
@@ -135,12 +138,14 @@ public class TestMappingCharFilter exten
// aa,20,22 => a,20,22
//
public void testTokenStream() throws Exception {
- CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
+ String testString = "h i j k ll cccc bbb aa";
+ CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
new int[]{0,2,4,6,8,11,16,20},
- new int[]{1,3,5,7,10,15,19,22}
+ new int[]{1,3,5,7,10,15,19,22},
+ testString.length()
);
}
@@ -155,13 +160,34 @@ public class TestMappingCharFilter exten
// ll,5,7 => llllllll,5,7
// h,8,9 => i,8,9
public void testChained() throws Exception {
+ String testString = "aaaa ll h";
CharStream cs = new MappingCharFilter( normMap,
- new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
+ new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
new int[]{0,5,8},
- new int[]{4,7,9}
+ new int[]{4,7,9},
+ testString.length()
);
}
+
+ public void testRandom() throws Exception {
+ Analyzer analyzer = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ }
+
+ @Override
+ protected Reader initReader(Reader reader) {
+ return new MappingCharFilter(normMap, CharReader.get(reader));
+ }
+ };
+
+ int numRounds = RANDOM_MULTIPLIER * 10000;
+ checkRandomData(random, analyzer, numRounds);
+ }
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java Fri Dec 16 04:37:47 2011
@@ -43,7 +43,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
- new int[] { 4, 7, 13 });
+ new int[] { 4, 7, 13 },
+ BLOCK.length());
}
// 012345678
@@ -67,7 +68,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
- new int[] { 8 });
+ new int[] { 8 },
+ BLOCK.length());
}
// 11111
@@ -82,7 +84,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },
- new int[] { 8, 11 });
+ new int[] { 8, 11 },
+ BLOCK.length());
}
// 01234567
@@ -96,7 +99,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },
- new int[] { 2, 5 });
+ new int[] { 2, 5 },
+ BLOCK.length());
}
// 11111
@@ -111,7 +115,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },
- new int[] { 11, 14 });
+ new int[] { 11, 14 },
+ BLOCK.length());
}
// 111111111122222222223333
@@ -126,7 +131,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
- new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 });
+ new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
+ BLOCK.length());
}
// 11111111112222222222333333333
@@ -141,7 +147,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
- new int[] { 7, 10, 14, 20, 24, 27, 35, 38 });
+ new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
+ BLOCK.length());
}
// 11111111112222222222333333333
@@ -158,7 +165,8 @@ public class TestPatternReplaceCharFilte
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
- new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 });
+ new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
+ BLOCK.length());
}
private Pattern pattern( String p ){
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java Fri Dec 16 04:37:47 2011
@@ -84,14 +84,16 @@ public class TestPatternTokenizer extend
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
new int[] { 0, 13, 26, 29 },
- new int[] { 12, 25, 28, 33 });
+ new int[] { 12, 25, 28, 33 },
+ INPUT.length());
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
new int[] { 0, 13 },
- new int[] { 12, 25 });
+ new int[] { 12, 25 },
+ INPUT.length());
}
/**