Posted to commits@lucene.apache.org by rm...@apache.org on 2011/12/16 05:37:48 UTC

svn commit: r1215038 - in /lucene/dev/trunk: lucene/src/test-framework/java/org/apache/lucene/analysis/ lucene/src/test-framework/java/org/apache/lucene/util/ modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/ modules/analysis/comm...

Author: rmuir
Date: Fri Dec 16 04:37:47 2011
New Revision: 1215038

URL: http://svn.apache.org/viewvc?rev=1215038&view=rev
Log:
LUCENE-2208: improve charfilter offset testing
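
In short: checkRandomData now also feeds HTML-ish random strings (via the new _TestUtil.randomHtmlishString) into analyzers, and the charfilter/tokenizer tests now pass the expected final offset (the length of the original input) as an extra argument to assertTokenStreamContents. A minimal sketch of that assertion pattern, assuming the normMap mapping built in TestMappingCharFilter's setUp (h -> i, j -> jj, k -> kkk, ...) and an illustrative input that is not part of the commit:

    public void testOffsetsSketch() throws Exception {
      String input = "h i j k";  // illustrative input, not from the commit
      CharStream cs = new MappingCharFilter(normMap, CharReader.get(new StringReader(input)));
      TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
      assertTokenStreamContents(ts,
          new String[] { "i", "i", "jj", "kkk" },  // terms after the mapping is applied
          new int[]    { 0, 2, 4, 6 },             // start offsets in the original input
          new int[]    { 1, 3, 5, 7 },             // end offsets in the original input
          input.length());                         // new: expected final offset
    }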

Modified:
    lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java

Modified: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Dec 16 04:37:47 2011
@@ -246,7 +246,7 @@ public abstract class BaseTokenStreamTes
   }
   
   // simple utility method for blasting tokenstreams with data to make sure they don't do anything crazy
-
+  // TODO: add a MockCharStream, and use it here too, to ensure that correctOffset etc is being done by tokenizers.
   public static void checkRandomData(Random random, Analyzer a, int iterations) throws IOException {
     checkRandomData(random, a, iterations, 20);
   }
@@ -254,13 +254,16 @@ public abstract class BaseTokenStreamTes
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
     for (int i = 0; i < iterations; i++) {
       String text;
-      switch(_TestUtil.nextInt(random, 0, 3)) {
+      switch(_TestUtil.nextInt(random, 0, 4)) {
         case 0: 
           text = _TestUtil.randomSimpleString(random);
           break;
         case 1:
           text = _TestUtil.randomRealisticUnicodeString(random, maxWordLength);
           break;
+        case 2:
+          text = _TestUtil.randomHtmlishString(random, maxWordLength);
+          break;
         default:
           text = _TestUtil.randomUnicodeString(random, maxWordLength);
       }

Modified: lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/trunk/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Fri Dec 16 04:37:47 2011
@@ -243,6 +243,31 @@ public class _TestUtil {
       }
     }
   }
+  
+  // TODO: make this more evil
+  public static String randomHtmlishString(Random random, int numElements) {
+    final int end = random.nextInt(numElements);
+    if (end == 0) {
+      // allow 0 length
+      return "";
+    }
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < end; i++) {
+      int val = random.nextInt(10);
+      switch(val) {
+        case 0: sb.append("<p>"); break;
+        case 1: sb.append("</p>"); break;
+        case 2: sb.append("<!--"); break;
+        case 3: sb.append("-->"); break;
+        case 4: sb.append("&#"); break;
+        case 5: sb.append(";"); break;
+        case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
+        default:
+          sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+      }
+    }
+    return sb.toString();
+  }
 
   private static final int[] blockStarts = {
     0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 

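randomHtmlishString simply concatenates up to numElements - 1 random fragments (open/close tags, comment markers, entity pieces, digits, letters), so the random-data tests get inputs that stress charfilter offset correction without having to be well-formed markup. A hedged usage sketch (the sample output is illustrative only; actual output depends on the Random instance and seed):

    Random random = new Random();
    String text = _TestUtil.randomHtmlishString(random, 20);
    // text might look like "<p>ab&#4;</p><!--z" or be empty; it contains at most
    // 19 fragments chosen from the cases in the switch above.
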
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java Fri Dec 16 04:37:47 2011
@@ -26,21 +26,15 @@ import java.io.StringReader;
 import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.junit.Ignore;
 
-import org.apache.lucene.util.LuceneTestCase;
+public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
-public class HTMLStripCharFilterTest extends LuceneTestCase {
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    super.tearDown();
-  }
   //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
   //
   public void test() throws IOException {
@@ -260,5 +254,24 @@ public class HTMLStripCharFilterTest ext
     // test backtracking
     doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
   }
+  
+  @Ignore("broken offsets: see LUCENE-2208")
+  public void testRandom() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
 
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+      }
+    };
+    
+    int numRounds = RANDOM_MULTIPLIER * 10000;
+    checkRandomData(random, analyzer, numRounds);
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java Fri Dec 16 04:37:47 2011
@@ -17,13 +17,16 @@
 
 package org.apache.lucene.analysis.charfilter;
 
+import java.io.Reader;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
 
@@ -65,55 +68,55 @@ public class TestMappingCharFilter exten
   public void testNothingChange() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
+    assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
+    assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to2() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
+    assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test1to3() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
+    assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
   }
 
   public void test2to4() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
+    assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
   }
 
   public void test2to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
+    assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
   }
 
   public void test3to1() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
+    assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
   }
 
   public void test4to2() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
+    assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
   }
 
   public void test5to0() throws Exception {
     CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
-    assertTokenStreamContents(ts, new String[0]);
+    assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
   }
 
   //
@@ -135,12 +138,14 @@ public class TestMappingCharFilter exten
   //   aa,20,22 =>    a,20,22
   //
   public void testTokenStream() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
+    String testString = "h i j k ll cccc bbb aa";
+    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"i","i","jj","kkk","llll","cc","b","a"},
       new int[]{0,2,4,6,8,11,16,20},
-      new int[]{1,3,5,7,10,15,19,22}
+      new int[]{1,3,5,7,10,15,19,22},
+      testString.length()
     );
   }
 
@@ -155,13 +160,34 @@ public class TestMappingCharFilter exten
   //   ll,5,7 => llllllll,5,7
   //    h,8,9 => i,8,9
   public void testChained() throws Exception {
+    String testString = "aaaa ll h";
     CharStream cs = new MappingCharFilter( normMap,
-        new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
+        new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
     TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
     assertTokenStreamContents(ts,
       new String[]{"a","llllllll","i"},
       new int[]{0,5,8},
-      new int[]{4,7,9}
+      new int[]{4,7,9},
+      testString.length()
     );
   }
+  
+  public void testRandom() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new MappingCharFilter(normMap, CharReader.get(reader));
+      }
+    };
+    
+    int numRounds = RANDOM_MULTIPLIER * 10000;
+    checkRandomData(random, analyzer, numRounds);
+  }
 }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java Fri Dec 16 04:37:47 2011
@@ -43,7 +43,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "this", "is", "test." },
         new int[] { 0, 5, 8 },
-        new int[] { 4, 7, 13 });
+        new int[] { 4, 7, 13 }, 
+        BLOCK.length());
   }
   
   // 012345678
@@ -67,7 +68,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa#bb#cc" },
         new int[] { 0 },
-        new int[] { 8 });
+        new int[] { 8 }, 
+        BLOCK.length());
   }
 
   //           11111
@@ -82,7 +84,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa##bb###cc", "dd" },
         new int[] { 0, 9 },
-        new int[] { 8, 11 });
+        new int[] { 8, 11 },
+        BLOCK.length());
   }
 
   // 01234567
@@ -96,7 +99,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa", "aa" },
         new int[] { 1, 4 },
-        new int[] { 2, 5 });
+        new int[] { 2, 5 },
+        BLOCK.length());
   }
 
   //           11111
@@ -111,7 +115,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa#bb", "dd" },
         new int[] { 0, 12 },
-        new int[] { 11, 14 });
+        new int[] { 11, 14 },
+        BLOCK.length());
   }
 
   //           111111111122222222223333
@@ -126,7 +131,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
         new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
-        new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 });
+        new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
+        BLOCK.length());
   }
 
   //           11111111112222222222333333333
@@ -141,7 +147,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
         new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
-        new int[] { 7, 10, 14, 20, 24, 27, 35, 38 });
+        new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
+        BLOCK.length());
   }
 
   //           11111111112222222222333333333
@@ -158,7 +165,8 @@ public class TestPatternReplaceCharFilte
     assertTokenStreamContents(ts,
         new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
         new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
-        new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 });
+        new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
+        BLOCK.length());
   }
   
   private Pattern pattern( String p ){

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java?rev=1215038&r1=1215037&r2=1215038&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java Fri Dec 16 04:37:47 2011
@@ -84,14 +84,16 @@ public class TestPatternTokenizer extend
     assertTokenStreamContents(stream,
         new String[] { "Günther", "Günther", "is", "here" },
         new int[] { 0, 13, 26, 29 },
-        new int[] { 12, 25, 28, 33 });
+        new int[] { 12, 25, 28, 33 },
+        INPUT.length());
     
     charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
     stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
     assertTokenStreamContents(stream,
         new String[] { "Günther", "Günther" },
         new int[] { 0, 13 },
-        new int[] { 12, 25 });
+        new int[] { 12, 25 },
+        INPUT.length());
   }
   
   /**