Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/26 14:10:08 UTC

svn commit: r916666 [4/16] - in /lucene/java/branches/flex_1458: ./ contrib/ contrib/analyzers/common/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/ contrib/analyzers/c...

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Fri Feb 26 13:09:54 2010
@@ -17,12 +17,14 @@
  * limitations under the License.
  */
 
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
+import java.io.IOException;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -139,6 +141,34 @@
     checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
   }
   
+  public void testStemExclusionTableBWCompat() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader("Brasília Brasilia")), set);
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+  }
+
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+            "Brasília Brasilia")), set));
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+  }
+
+  public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("Brasília");
+    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set1.add("Brasilia");
+    BrazilianStemFilter filter = new BrazilianStemFilter(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+            "Brasília Brasilia")), set), set1);
+    assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
+  }
+  
   /* 
    * Test that changes to the exclusion table are applied immediately
    * when using reusable token streams.

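A note on the pattern introduced above: KeywordMarkerTokenFilter marks every token contained in the supplied CharArraySet with the KeywordAttribute, and keyword-aware stem filters then leave those tokens alone, which is why "Brasília" survives unstemmed. A minimal sketch of a stemmer honoring that contract (the class and its trailing-'s' rule are hypothetical illustrations, not the actual BrazilianStemFilter code):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    /** Hypothetical stem filter: tokens marked as keywords pass through unstemmed. */
    public final class SketchStemFilter extends TokenFilter {
      private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
      private final TermAttribute termAtt = addAttribute(TermAttribute.class);

      public SketchStemFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken())
          return false;
        if (!keywordAtt.isKeyword()) {  // set upstream by KeywordMarkerTokenFilter
          termAtt.setTermBuffer(stem(termAtt.term()));
        }
        return true;
      }

      /** Stand-in stemming rule, for illustration only: strip a trailing 's'. */
      private static String stem(String s) {
        return s.endsWith("s") ? s.substring(0, s.length() - 1) : s;
      }
    }
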
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Fri Feb 26 13:09:54 2010
@@ -18,14 +18,9 @@
  */
 
 import java.io.IOException;
-import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.Version;
 
 public class TestCJKTokenizer extends BaseTokenStreamTestCase {
@@ -47,33 +42,33 @@
   }
 
   public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
-    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
-    TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
-    TypeAttribute typeAtt = tokenizer.getAttribute(TypeAttribute.class);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+    String terms[] = new String[out_tokens.length];
+    int startOffsets[] = new int[out_tokens.length];
+    int endOffsets[] = new int[out_tokens.length];
+    String types[] = new String[out_tokens.length];
     for (int i = 0; i < out_tokens.length; i++) {
-      assertTrue(tokenizer.incrementToken());
-      assertEquals(termAtt.term(), out_tokens[i].termText);
-      assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
-      assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
-      assertEquals(typeAtt.type(), out_tokens[i].type);
+      terms[i] = out_tokens[i].termText;
+      startOffsets[i] = out_tokens[i].start;
+      endOffsets[i] = out_tokens[i].end;
+      types[i] = out_tokens[i].type;
     }
-    assertFalse(tokenizer.incrementToken());
+    assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
   }
   
   public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
-    TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
-    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
-    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+    String terms[] = new String[out_tokens.length];
+    int startOffsets[] = new int[out_tokens.length];
+    int endOffsets[] = new int[out_tokens.length];
+    String types[] = new String[out_tokens.length];
     for (int i = 0; i < out_tokens.length; i++) {
-      assertTrue(ts.incrementToken());
-      assertEquals(termAtt.term(), out_tokens[i].termText);
-      assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
-      assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
-      assertEquals(typeAtt.type(), out_tokens[i].type);
+      terms[i] = out_tokens[i].termText;
+      startOffsets[i] = out_tokens[i].start;
+      endOffsets[i] = out_tokens[i].end;
+      types[i] = out_tokens[i].type;
     }
-    assertFalse(ts.incrementToken());
+    assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
   }
   
   public void testJa1() throws IOException {
@@ -219,13 +214,8 @@
   
   public void testTokenStream() throws Exception {
     Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
-    TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
-    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
-    assertTrue(ts.incrementToken());
-    assertEquals("\u4e00\u4e01", termAtt.term());
-    assertTrue(ts.incrementToken());
-    assertEquals("\u4e01\u4e02", termAtt.term());
-    assertFalse(ts.incrementToken());
+    assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02", 
+        new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
   }
   
   public void testReusableTokenStream() throws Exception {
@@ -261,4 +251,24 @@
     };
     checkCJKTokenReusable(analyzer, str, out_tokens2);
   }
+  
+  /**
+   * LUCENE-2207: wrong offset calculated by end() 
+   */
+  public void testFinalOffset() throws IOException {
+    checkCJKToken("あい", new TestToken[] { 
+        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+    checkCJKToken("あい   ", new TestToken[] { 
+        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+    checkCJKToken("test", new TestToken[] { 
+        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+    checkCJKToken("test   ", new TestToken[] { 
+        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+    checkCJKToken("あいtest", new TestToken[] {
+        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+        newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+    checkCJKToken("testあい    ", new TestToken[] { 
+        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
+        newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+  }
 }

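The hand-rolled incrementToken() loops deleted above are exactly what the BaseTokenStreamTestCase helpers centralize. Roughly, and omitting the extra checks the real helper performs (clearAttributes, position increments, end(), close()), assertAnalyzesTo reduces to something like this sketch:

    static void assertAnalyzesToSketch(Analyzer analyzer, String input, String[] terms,
        int[] startOffsets, int[] endOffsets, String[] types) throws IOException {
      TokenStream ts = analyzer.tokenStream("dummy", new StringReader(input));
      TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
      TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
      for (int i = 0; i < terms.length; i++) {
        assertTrue(ts.incrementToken());
        assertEquals(terms[i], termAtt.term());
        assertEquals(startOffsets[i], offsetAtt.startOffset());
        assertEquals(endOffsets[i], offsetAtt.endOffset());
        assertEquals(types[i], typeAtt.type());
      }
      assertFalse(ts.incrementToken());  // the stream must be exhausted
    }
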
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java Fri Feb 26 13:09:54 2010
@@ -24,11 +24,13 @@
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.Version;
 
 
+/** @deprecated Remove this test when ChineseAnalyzer is removed. */
+@Deprecated
 public class TestChineseTokenizer extends BaseTokenStreamTestCase
 {
     public void testOtherLetterOffset() throws IOException
@@ -78,7 +80,7 @@
     private class JustChineseFilterAnalyzer extends Analyzer {
       @Override
       public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new ChineseFilter(new WhitespaceTokenizer(reader));
+        return new ChineseFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader));
       }
     }
     

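The Version parameter now threaded through WhitespaceTokenizer and the other CharTokenizer subclasses is the back-compat switch for the tokenizer overhaul: with LUCENE_31 and later the tokenizers work on full Unicode code points, so supplementary characters are classified correctly, while LUCENE_30 keeps the legacy per-char behaviour. A behavioural sketch (not a test from this commit; the expected splits are my reading of the change):

    // U+29E3D is a supplementary CJK letter encoded as a surrogate pair.
    String s = "ab\uD867\uDE3Dcd";
    // LUCENE_31+: isLetter() is evaluated per code point, so the whole string
    // should come back as one token
    Tokenizer modern = new LetterTokenizer(Version.LUCENE_31, new StringReader(s));
    // LUCENE_30: the surrogates are examined as individual chars, which should
    // break the input into the tokens "ab" and "cd"
    Tokenizer legacy = new LetterTokenizer(Version.LUCENE_30, new StringReader(s));
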
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -28,6 +28,7 @@
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
@@ -46,8 +47,8 @@
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
         .getHyphenationTree(reader);
 
-    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
-        new WhitespaceTokenizer(new StringReader(
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT, 
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
             "min veninde som er lidt af en læsehest")), hyphenator,
         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
@@ -66,8 +67,8 @@
         .getHyphenationTree(reader);
 
     // the word basket will not be added due to the longest match option
-    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
-        new WhitespaceTokenizer(new StringReader(
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT, 
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
             "basketballkurv")), hyphenator, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
@@ -83,8 +84,8 @@
         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
         "Sko", "Vind", "Rute", "Torkare", "Blad" };
 
-    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
-        new WhitespaceTokenizer(
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT, 
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, 
             new StringReader(
                 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
         dict);
@@ -112,8 +113,8 @@
         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
         "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
 
-    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
-        new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")),
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT, 
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
@@ -128,9 +129,9 @@
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };
 
-    Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
+    Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
         "Rindfleischüberwachungsgesetz"));
-    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT, 
         wsTokenizer, dict,
         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -109,5 +110,11 @@
     
     assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
   }
-
+  
+  public void testWithStemExclusionSet() throws IOException{
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("hole");
+    CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
+  }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java Fri Feb 26 13:09:54 2010
@@ -18,8 +18,12 @@
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -270,4 +274,13 @@
     assertAnalyzesTo(cz, "e", new String[] { "e" });
     assertAnalyzesTo(cz, "zi", new String[] { "zi" });
   }
+  
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("hole");
+    CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerTokenFilter(
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hole desek")), set));
+    assertTokenStreamContents(filter, new String[] { "hole", "desk" });
+  }
+  
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Fri Feb 26 13:09:54 2010
@@ -21,9 +21,13 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
+import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.Version;
 
 /**
@@ -35,6 +39,8 @@
 public class TestGermanStemFilter extends BaseTokenStreamTestCase {
 
   public void testStemming() throws Exception {
+    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
     // read test cases from external file:
     File dataDir = new File(System.getProperty("dataDir", "./bin"));
     File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -50,36 +56,12 @@
         continue;    // ignore comments and empty lines
       String[] parts = line.split(";");
       //System.out.println(parts[0] + " -- " + parts[1]);
-      check(parts[0], parts[1]);
+      tokenizer.reset(new StringReader(parts[0]));
+      filter.reset();
+      assertTokenStreamContents(filter, new String[] { parts[1] });
     }
     breader.close();
     isr.close();
     fis.close();
   }
-  
-  public void testReusableTokenStream() throws Exception {
-    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
-    checkReuse(a, "Tisch", "tisch");
-    checkReuse(a, "Tische", "tisch");
-    checkReuse(a, "Tischen", "tisch");
-  }
-  
-  /* 
-   * Test that changes to the exclusion table are applied immediately
-   * when using reusable token streams.
-   */
-  public void testExclusionTableReuse() throws Exception {
-    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
-    checkReuse(a, "tischen", "tisch");
-    a.setStemExclusionTable(new String[] { "tischen" });
-    checkReuse(a, "tischen", "tischen");
-  }
-  
-  private void check(final String input, final String expected) throws Exception {
-    checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
-  }
-  
-  private void checkReuse(Analyzer a, String input, String expected) throws Exception {
-    checkOneTermReuse(a, input, expected);
-  }
 }

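The rewritten testStemming no longer builds a GermanAnalyzer per input line; it drives one KeywordTokenizer + GermanStemFilter chain over all the test data by rebinding the tokenizer with reset(Reader) and clearing filter state with reset(). The same idiom in isolation (inside a BaseTokenStreamTestCase subclass; the word/stem pairs are illustrative, taken from the removed checkReuse calls):

    Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
    TokenFilter filter = new GermanStemFilter(
        new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
    String[][] cases = { { "Tische", "tisch" }, { "Tischen", "tisch" } };
    for (String[] c : cases) {
      tokenizer.reset(new StringReader(c[0]));  // point the shared tokenizer at new input
      filter.reset();                           // reset per-stream state in the chain
      assertTokenStreamContents(filter, new String[] { c[1] });
    }
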
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java Fri Feb 26 13:09:54 2010
@@ -18,7 +18,6 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
 
 /**
@@ -63,4 +62,23 @@
 	    assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3  \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
 	            new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
 	}
+	
+	/**
+	 * GreekAnalyzer didn't apply StandardFilter, so acronyms were not normalized.
+	 * Check that this behavior is preserved.
+	 * @deprecated remove this test in Lucene 4.0
+	 */
+	@Deprecated
+	public void testAcronymBWCompat() throws Exception {
+	  Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
+	  assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
+	}
+	
+  /**
+   * test that acronym normalization works
+   */
+  public void testAcronym() throws Exception {
+    Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
+  }
 }

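The paired tests (LUCENE_30 keeps the old output, LUCENE_31 normalizes the acronym) follow the usual idiom of gating an analysis change on the matchVersion passed to the constructor. Schematically, in an analyzer's chain construction, this is a sketch of the idiom rather than the actual GreekAnalyzer source:

    TokenStream result = new StandardTokenizer(matchVersion, reader);
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
      // new behaviour only for the 3.1+ semantics
      result = new StandardFilter(result);
    }
    result = new GreekLowerCaseFilter(result);
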
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.util.Version;
 
 /**
  * Test the Persian Normalization Filter
@@ -54,7 +55,7 @@
   }
 
   private void check(final String input, final String expected) throws IOException {
-    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(
+    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT, 
         new StringReader(input));
     PersianNormalizationFilter filter = new PersianNormalizationFilter(
         tokenStream);

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -17,11 +17,10 @@
  * limitations under the License.
  */
 
-import java.io.StringReader;
+import java.io.IOException;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -116,6 +115,94 @@
 
 	}
 	
+	/**
+	 * @deprecated remove this test for Lucene 4.0
+	 */
+	@Deprecated
+	public void testAnalyzer30() throws Exception {
+	    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
+	  
+	    assertAnalyzesTo(fa, "", new String[] {
+	    });
+
+	    assertAnalyzesTo(
+	      fa,
+	      "chien chat cheval",
+	      new String[] { "chien", "chat", "cheval" });
+
+	    assertAnalyzesTo(
+	      fa,
+	      "chien CHAT CHEVAL",
+	      new String[] { "chien", "chat", "cheval" });
+
+	    assertAnalyzesTo(
+	      fa,
+	      "  chien  ,? + = -  CHAT /: > CHEVAL",
+	      new String[] { "chien", "chat", "cheval" });
+
+	    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+	    assertAnalyzesTo(
+	      fa,
+	      "mot \"entreguillemet\"",
+	      new String[] { "mot", "entreguillemet" });
+
+	    // let's do some french specific tests now  
+
+	    /* 1. couldn't resist:
+	     I would expect this to stay one term, as in French the minus
+	    sign is often used for composing words */
+	    assertAnalyzesTo(
+	      fa,
+	      "Jean-François",
+	      new String[] { "jean", "françois" });
+
+	    // 2. stopwords
+	    assertAnalyzesTo(
+	      fa,
+	      "le la chien les aux chat du des à cheval",
+	      new String[] { "chien", "chat", "cheval" });
+
+	    // some nouns and adjectives
+	    assertAnalyzesTo(
+	      fa,
+	      "lances chismes habitable chiste éléments captifs",
+	      new String[] {
+	        "lanc",
+	        "chism",
+	        "habit",
+	        "chist",
+	        "élément",
+	        "captif" });
+
+	    // some verbs
+	    assertAnalyzesTo(
+	      fa,
+	      "finissions souffrirent rugissante",
+	      new String[] { "fin", "souffr", "rug" });
+
+	    // some everything else
+	    // aujourd'hui stays one term which is OK
+	    assertAnalyzesTo(
+	      fa,
+	      "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+	      new String[] {
+	        "c3po",
+	        "aujourd'hui",
+	        "oeuf",
+	        "ïâöûàä",
+	        "anticonstitutionnel",
+	        "jav" });
+
+	    // some more everything else
+	    // here 1940-1945 stays as one term, 1940:1945 not ?
+	    assertAnalyzesTo(
+	      fa,
+	      "33Bis 1940-1945 1940:1945 (---i+++)*",
+	      new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
+
+	  }
+	
 	public void testReusableTokenStream() throws Exception {
 	  FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
 	  // stopwords
@@ -147,4 +234,41 @@
 	  fa.setStemExclusionTable(new String[] { "habitable" });
 	  assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
 	}
+	
+  public void testExclusionTableViaCtor() throws Exception {
+    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    set.add("habitable");
+    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT,
+        CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+
+    fa = new FrenchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+        "chist" });
+  }
+  
+  public void testElision() throws Exception {
+    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+  }
+  
+  /**
+   * Prior to 3.1, this analyzer had no lowercase filter, so
+   * stopwords were case sensitive. Preserve this for back compat.
+   * @deprecated Remove this test in Lucene 4.0
+   */
+  @Deprecated
+  public void testBuggyStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
+    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
+  }
+  
+  /**
+   * Test that stopwords are not case sensitive
+   */
+  public void testStopwordsCasing() throws IOException {
+    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Votre", new String[] { });
+  }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -19,10 +19,8 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -33,25 +31,13 @@
 
     PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
         new SingleTokenTokenStream(createToken("^", 0, 0)),
-        new WhitespaceTokenizer(new StringReader("hello world")),
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")),
         new SingleTokenTokenStream(createToken("$", 0, 0)));
 
-    assertNext(ts, "^", 0, 0);
-    assertNext(ts, "hello", 0, 5);
-    assertNext(ts, "world", 6, 11);
-    assertNext(ts, "$", 11, 11);
-    assertFalse(ts.incrementToken());
-  }
-
-
-  private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
-
-    assertTrue(ts.incrementToken());
-    assertEquals(text, termAtt.term());
-    assertEquals(startOffset, offsetAtt.startOffset());
-    assertEquals(endOffset, offsetAtt.endOffset());
+    assertTokenStreamContents(ts,
+        new String[] { "^", "hello", "world", "$" },
+        new int[] { 0, 0, 6, 11 },
+        new int[] { 0, 5, 11, 11 });
   }
 
   private static Token createToken(String term, int start, int offset)

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -19,10 +19,8 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -36,31 +34,21 @@
     ts = new PrefixAwareTokenFilter(
         new SingleTokenTokenStream(createToken("a", 0, 1)),
         new SingleTokenTokenStream(createToken("b", 0, 1)));
-    assertNext(ts, "a", 0, 1);
-    assertNext(ts, "b", 1, 2);
-    assertFalse(ts.incrementToken());
+    assertTokenStreamContents(ts, 
+        new String[] { "a", "b" },
+        new int[] { 0, 1 },
+        new int[] { 1, 2 });
 
     // prefix and suffix using 2x prefix
 
-    ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
+    ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
+        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")));
     ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
 
-    assertNext(ts, "^", 0, 0);
-    assertNext(ts, "hello", 0, 5);
-    assertNext(ts, "world", 6, 11);
-    assertNext(ts, "$", 11, 11);
-    assertFalse(ts.incrementToken());
-  }
-
-
-  private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
-
-    assertTrue(ts.incrementToken());
-    assertEquals(text, termAtt.term());
-    assertEquals(startOffset, offsetAtt.startOffset());
-    assertEquals(endOffset, offsetAtt.endOffset());
+    assertTokenStreamContents(ts,
+        new String[] { "^", "hello", "world", "$" },
+        new int[] { 0, 0, 6, 11 },
+        new int[] { 0, 5, 11, 11 });
   }
 
   private static Token createToken(String term, int start, int offset)

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -20,6 +20,7 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
 import java.io.StringReader;
 
@@ -32,7 +33,7 @@
   @Override
   public void setUp() throws Exception {
     super.setUp();
-    input = new WhitespaceTokenizer(new StringReader("abcde"));
+    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
   }
 
   public void testInvalidInput() throws Exception {
@@ -91,13 +92,13 @@
   }
   
   public void testSmallTokenInStream() throws Exception {
-    input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+    assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
   
   public void testReset() throws Exception {
-    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
     EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
     tokenizer.reset(new StringReader("abcde"));

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -66,33 +66,33 @@
 
   public void testFrontUnigram() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
-    assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1});
+    assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1}, 5 /* abcde */);
   }
 
   public void testBackUnigram() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
-    assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5});
+    assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5}, 5 /* abcde */);
   }
 
   public void testOversizedNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
-    assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+    assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
   }
 
   public void testFrontRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
   }
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+    assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
   }
   
   public void testReset() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
-    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
     tokenizer.reset(new StringReader("abcde"));
-    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+    assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
   }
 }

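The new trailing argument to assertTokenStreamContents (5 /* abcde */) is the expected final offset: what OffsetAttribute must report once end() has been called, i.e. the length of the consumed input rather than the end of the last token. This is the same contract the CJK testFinalOffset above pins down (LUCENE-2207). From a consumer's point of view, roughly:

    TokenStream ts = new EdgeNGramTokenizer(new StringReader("abcde"),
        EdgeNGramTokenizer.Side.FRONT, 1, 3);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // consume "a", "ab", "abc"
    }
    ts.end();  // now endOffset() reports the final offset: 5, the input length
    int finalOffset = offsetAtt.endOffset();
    ts.close();
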
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -20,8 +20,8 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
-import java.io.IOException;
 import java.io.StringReader;
 
 /**
@@ -33,7 +33,7 @@
     @Override
     public void setUp() throws Exception {
         super.setUp();
-        input = new WhitespaceTokenizer(new StringReader("abcde"));
+        input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
     }
 
     public void testInvalidInput() throws Exception {
@@ -81,13 +81,13 @@
     }
     
     public void testSmallTokenInStream() throws Exception {
-      input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+      input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
       NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-      assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+      assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
     }
     
     public void testReset() throws Exception {
-      WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+      WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
       NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
       assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
       tokenizer.reset(new StringReader("abcde"));

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -18,10 +18,8 @@
  */
 
 
-import java.io.IOException;
 import java.io.StringReader;
 
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 
 /**
@@ -58,12 +56,12 @@
 
     public void testUnigrams() throws Exception {
         NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-        assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+        assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
     }
 
     public void testBigrams() throws Exception {
         NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
-        assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
+        assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
     }
 
     public void testNgrams() throws Exception {
@@ -71,19 +69,20 @@
         assertTokenStreamContents(tokenizer,
           new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"}, 
           new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
-          new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+          new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+          5 /* abcde */
         );
     }
 
     public void testOversizedNgrams() throws Exception {
         NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
-        assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+        assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
     }
     
     public void testReset() throws Exception {
       NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
       tokenizer.reset(new StringReader("abcde"));
-      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+      assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
     }
 }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Fri Feb 26 13:09:54 2010
@@ -18,9 +18,11 @@
  */
 
 import java.io.File;
+import java.io.IOException;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.Version;
 
 /**
@@ -98,9 +100,6 @@
 	 check("ophalend", "ophal");
 	 check("ophalers", "ophaler");
 	 check("ophef", "ophef");
-	 check("opheffen", "ophef"); // versus snowball 'opheff'
-	 check("opheffende", "ophef"); // versus snowball 'opheff'
-	 check("opheffing", "ophef"); // versus snowball 'opheff'
 	 check("opheldering", "ophelder");
 	 check("ophemelde", "ophemeld");
 	 check("ophemelen", "ophemel");
@@ -116,6 +115,24 @@
 	 check("ophouden", "ophoud");
   }
   
+  /**
+   * @deprecated remove this test in Lucene 4.0
+   */
+  @Deprecated
+  public void testOldBuggyStemmer() throws Exception {
+    Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
+    checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
+    checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
+    checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
+  }
+  
+  public void testSnowballCorrectness() throws Exception {
+    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+    checkOneTermReuse(a, "opheffen", "opheff");
+    checkOneTermReuse(a, "opheffende", "opheff");
+    checkOneTermReuse(a, "opheffing", "opheff");
+  }
+  
   public void testReusableTokenStream() throws Exception {
     Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT); 
     checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
@@ -133,6 +150,19 @@
     checkOneTermReuse(a, "lichamelijk", "licham");
     a.setStemExclusionTable(new String[] { "lichamelijk" });
     checkOneTermReuse(a, "lichamelijk", "lichamelijk");
+
+    
+  }
+  
+  public void testExclusionTableViaCtor() throws IOException {
+    CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
+    set.add("lichamelijk");
+    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+    
+    a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
   }
   
   /* 
@@ -146,6 +176,25 @@
     checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
   }
   
+  /**
+   * Prior to 3.1, this analyzer had no lowercase filter, so
+   * stopwords were case sensitive. Preserve this for back compat.
+   * @deprecated Remove this test in Lucene 4.0
+   */
+  @Deprecated
+  public void testBuggyStopwordsCasing() throws IOException {
+    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
+    assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
+  }
+  
+  /**
+   * Test that stopwords are not case sensitive
+   */
+  public void testStopwordsCasing() throws IOException {
+    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
+    assertAnalyzesTo(a, "Zelf", new String[] { });
+  }
+  
   private void check(final String input, final String expected) throws Exception {
     checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected); 
   }

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
 
 import java.io.StringReader;
 
@@ -35,7 +36,7 @@
   public void testPayloads() throws Exception {
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
-      (new WhitespaceTokenizer(new StringReader(test)), 
+      (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), 
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
@@ -56,7 +57,7 @@
 
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
-      (new WhitespaceTokenizer(new StringReader(test)), 
+      (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), 
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     assertTermEquals("The", filter, null);
     assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
@@ -74,7 +75,7 @@
 
   public void testFloatEncoding() throws Exception {
     String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)), '|', new FloatEncoder());
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new FloatEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
@@ -92,7 +93,7 @@
 
   public void testIntEncoding() throws Exception {
     String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)), '|', new IntegerEncoder());
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);

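For completeness, the float payloads produced by FloatEncoder above can be decoded on the consuming side with PayloadHelper from the same package. A minimal sketch along the lines of the tests in this file:

    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("fox|3.5")),
        '|', new FloatEncoder());
    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
    PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
    assertTrue(filter.incrementToken());
    assertEquals("fox", termAtt.term());
    // FloatEncoder stored 3.5 as payload bytes; decode them back
    float weight = PayloadHelper.decodeFloat(payAtt.getPayload().getData());
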
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -37,7 +38,7 @@
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
+    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))), 3, "D");
     boolean seenDogs = false;
     TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -21,6 +21,7 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -35,7 +36,7 @@
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
     int count = 0;
     PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
     OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
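
The same one-line migration. TokenOffsetPayloadTokenFilter needs no type helper: it encodes each token's start and end offset into its payload. A minimal sketch:

    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT,
            new StringReader("The quick red fox jumped over the lazy brown dogs")));
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
    while (nptf.incrementToken()) {
      // the payload bytes mirror offsetAtt.startOffset() and offsetAtt.endOffset()
      Payload payload = payloadAtt.getPayload();
    }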

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -37,7 +38,7 @@
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
+    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
     int count = 0;
     TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
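
Again only the tokenizer line changes. TypeAsPayloadTokenFilter stores each token's type string as its payload bytes; a minimal sketch, with the test-local WordTokenFilter assumed as above:

    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT,
            new StringReader("The quick red fox jumped over the lazy brown dogs"))));
    // after each incrementToken(), the token's payload holds its type, e.g. "word"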

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java Fri Feb 26 13:09:54 2010
@@ -51,7 +51,7 @@
   protected void setUp() throws Exception {
     super.setUp();
     dir = new RAMDirectory();
-    appAnalyzer = new WhitespaceAnalyzer();
+    appAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
     IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
     int numDocs = 200;
     for (int i = 0; i < numDocs; i++) {
@@ -157,9 +157,9 @@
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       if (++invocationCount % 2 == 0)
-        return new WhitespaceTokenizer(reader);
+        return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
       else
-        return new LetterTokenizer(reader);
+        return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
     }
   }
   
@@ -173,7 +173,7 @@
   }
   
   public void testTokenStream() throws Exception {
-    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer());
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
     a.addStopWords(reader, 10);
     TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
     TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
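
The migrated testTokenStream exercises the wrapper end to end. A minimal sketch of the same flow, assuming an IndexReader named reader is open over the indexed documents; addStopWords treats any term occurring in more than maxDocFreq documents of a field as a stop word for subsequent tokenStream calls:

    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(
        Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
    a.addStopWords(reader, 10);   // maxDocFreq = 10
    TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(termAtt.term()); // auto-detected stop words are dropped
    }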

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Fri Feb 26 13:09:54 2010
@@ -27,9 +27,9 @@
 
 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   public void testFilter() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(
+    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, 
         new StringReader("Do have a nice day"));     // 1-4 length string
-    ReverseStringFilter filter = new ReverseStringFilter(stream);
+    ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream);
     TermAttribute text = filter.getAttribute(TermAttribute.class);
     assertTrue(filter.incrementToken());
     assertEquals("oD", text.term());
@@ -45,9 +45,9 @@
   }
   
   public void testFilterWithMark() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
         "Do have a nice day")); // 1-4 length string
-    ReverseStringFilter filter = new ReverseStringFilter(stream, '\u0001');
+    ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream, '\u0001');
     TermAttribute text = filter
         .getAttribute(TermAttribute.class);
     assertTrue(filter.incrementToken());
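
Both ReverseStringFilter constructors now lead with Version. The three-argument form prepends the given mark character to each reversed token, the convention used to make leading-wildcard queries cheap; a minimal sketch:

    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("Do have a nice day"));
    ReverseStringFilter filter =
        new ReverseStringFilter(Version.LUCENE_CURRENT, stream, '\u0001');
    TermAttribute text = filter.getAttribute(TermAttribute.class);
    filter.incrementToken();
    // "Do" comes back reversed with the marker prepended: "\u0001oD"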

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -26,6 +26,7 @@
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.Version;
@@ -49,9 +50,14 @@
       dataDir = new File(System.getProperty("dataDir", "./bin"));
     }
 
-    public void testUnicode() throws IOException
+    /**
+     * @deprecated Remove this test and its datafiles in Lucene 4.0;
+     * the Snowball version has its own data tests.
+     */
+    @Deprecated
+    public void testUnicode30() throws IOException
     {
-        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
+        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
         inWords =
             new InputStreamReader(
                 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@@ -65,7 +71,7 @@
         TokenStream in = ra.tokenStream("all", inWords);
 
         RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(
+            new RussianLetterTokenizer(Version.LUCENE_CURRENT,
                 sampleUnicode);
 
         TermAttribute text = in.getAttribute(TermAttribute.class);
@@ -109,11 +115,29 @@
         }
     }
     
+    /** @deprecated remove this test in Lucene 4.0: stopwords changed */
+    @Deprecated
+    public void testReusableTokenStream30() throws Exception {
+      Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
+      assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+          new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
+      assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
+          new String[] { "знан", "хран", "тайн" });
+    }
+
     public void testReusableTokenStream() throws Exception {
       Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
       assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
-          new String[] { "знан", "хран", "тайн" });
+          new String[] { "знан", "эт", "хран", "тайн" });
+    }
+
+    public void testWithStemExclusionSet() throws Exception {
+      CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+      set.add("представление");
+      Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT, RussianAnalyzer.getDefaultStopSet(), set);
+      assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+          new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
     }
 }
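
The new testWithStemExclusionSet pins down the three-argument constructor, (matchVersion, stopwords, stemExclusionSet): words in the exclusion set are marked as keywords and skipped by the stemmer, while the rest of the text stems normally. A minimal sketch:

    CharArraySet exclusions = new CharArraySet(Version.LUCENE_CURRENT, 1, true); // ignore case
    exclusions.add("представление");
    Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT,
        RussianAnalyzer.getDefaultStopSet(), exclusions);
    // "представление" passes through unstemmed; "электромагнитной" still
    // stems to "электромагнитн", as the assertion above shows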

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,10 @@
 import java.io.FileInputStream;
 import java.util.ArrayList;
 
+/**
+ * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
+ */
+@Deprecated
 public class TestRussianStem extends LuceneTestCase
 {
     private ArrayList words = new ArrayList();

Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Fri Feb 26 13:09:54 2010
@@ -106,7 +106,7 @@
    */
   public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-                                     (new WhitespaceAnalyzer(), 2),
+                                     (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
                                  "test sentence");
     int[] ranks = new int[] { 1, 2, 0 };
     compareRanks(hits, ranks);
@@ -117,7 +117,7 @@
    */
   public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-                                     (new WhitespaceAnalyzer(), 2),
+                                     (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
                                  "\"this sentence\"");
     int[] ranks = new int[] { 0 };
     compareRanks(hits, ranks);
@@ -128,7 +128,7 @@
    */
   public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-                                     (new WhitespaceAnalyzer(), 2),
+                                     (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
                                  "\"test sentence\"");
     int[] ranks = new int[] { 1 };
     compareRanks(hits, ranks);
@@ -139,7 +139,7 @@
    */
   public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-                                     (new WhitespaceAnalyzer(), 2),
+                                     (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
                                  "+test +sentence");
     int[] ranks = new int[] { 1, 2 };
     compareRanks(hits, ranks);
@@ -149,7 +149,7 @@
    * This shows how to construct a phrase query containing shingles.
    */
   public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
-    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
     searcher = setUpSearcher(analyzer);
 
     PhraseQuery q = new PhraseQuery();
@@ -178,7 +178,7 @@
    * in the right order and adjacent to each other.
    */
   public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
-    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
     searcher = setUpSearcher(analyzer);
 
     BooleanQuery q = new BooleanQuery();
@@ -200,7 +200,7 @@
   }
   
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
     assertAnalyzesToReuse(a, "please divide into shingles",
         new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
@@ -222,9 +222,9 @@
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       if (++invocationCount % 2 == 0)
-        return new WhitespaceTokenizer(reader);
+        return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
       else
-        return new LetterTokenizer(reader);
+        return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
     }
   }
   
@@ -246,4 +246,118 @@
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
   }
+
+  public void testNonDefaultMinShingleSize() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 3, 4);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please",   "please divide this",   "please divide this sentence", 
+                                         "divide",   "divide this sentence", "divide this sentence into", 
+                                         "this",     "this sentence into",   "this sentence into shingles",
+                                         "sentence", "sentence into shingles",
+                                         "into",
+                                         "shingles" },
+                          new int[] { 0,  0,  0,  7,  7,  7, 14, 14, 14, 19, 19, 28, 33 },
+                          new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
+                          new int[] { 1,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please divide this",   "please divide this sentence", 
+                                         "divide this sentence", "divide this sentence into", 
+                                         "this sentence into",   "this sentence into shingles",
+                                         "sentence into shingles" },
+                          new int[] {  0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 18, 27, 27, 32, 32, 41, 41 },
+                          new int[] {  1,  0,  1,  0,  1,  0,  1 });
+  }
+
+  public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 3, 3);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please",   "please divide this", 
+                                         "divide",   "divide this sentence", 
+                                         "this",     "this sentence into",
+                                         "sentence", "sentence into shingles",
+                                         "into",
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19, 19, 28, 33 },
+                          new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1,  0,  1,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please divide this", 
+                                         "divide this sentence", 
+                                         "this sentence into",
+                                         "sentence into shingles" },
+                          new int[] {  0,  7, 14, 19 },
+                          new int[] { 18, 27, 32, 41 },
+                          new int[] {  1,  1,  1,  1 });
+  }
+
+  public void testNoTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+    analyzer.setTokenSeparator("");
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "pleasedivide", 
+                                         "divide", "divideinto", 
+                                         "into", "intoshingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "pleasedivide", 
+                                         "divideinto", 
+                                         "intoshingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
+
+  public void testNullTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+    analyzer.setTokenSeparator(null);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "pleasedivide", 
+                                         "divide", "divideinto", 
+                                         "into", "intoshingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "pleasedivide", 
+                                         "divideinto", 
+                                         "intoshingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
+
+  public void testAltTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+    analyzer.setTokenSeparator("<SEP>");
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "please<SEP>divide", 
+                                         "divide", "divide<SEP>into", 
+                                         "into", "into<SEP>shingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please<SEP>divide", 
+                                         "divide<SEP>into", 
+                                         "into<SEP>shingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
 }
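
Taken together, the new tests pin down the wrapper's three knobs: min/max shingle size, unigram output, and the token separator. A minimal sketch combining them (the separator "_" is illustrative; "" or null joins words directly, as testNoTokenSeparator and testNullTokenSeparator show):

    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
        new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 3, 4);
    analyzer.setOutputUnigrams(false);  // drop the single-word tokens
    analyzer.setTokenSeparator("_");
    TokenStream ts = analyzer.tokenStream("f",
        new StringReader("please divide this sentence"));
    // emits "please_divide_this", "please_divide_this_sentence",
    // "divide_this_sentence": all sizes at one start position, then the next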