Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/04 11:11:06 UTC

svn commit: r940781 - in /lucene/dev/trunk: lucene/contrib/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/ solr/src/java/org/apache/s...

Author: rmuir
Date: Tue May  4 09:11:05 2010
New Revision: 940781

URL: http://svn.apache.org/viewvc?rev=940781&view=rev
Log:
LUCENE-2413: consolidate WDF into contrib/analyzers

Added:
    lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
      - copied, changed from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
    lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
      - copied, changed from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
    lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
      - copied, changed from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
      - copied, changed from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Removed:
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
    lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=940781&r1=940780&r2=940781&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Tue May  4 09:11:05 2010
@@ -161,6 +161,8 @@ New features
      and phrases. 
    - o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML 
      constructs.
+   - o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words 
+     into subwords and performs optional transformations on subword groups.
    (... in progress)
 
 Build

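For illustration, here is a minimal sketch of the newly public filter in its contrib/analyzers home. The construction mirrors the doSplit() helper in the moved test further down; this diff never names the five trailing int arguments, so reading them as the generateWordParts, generateNumberParts, catenateWords, catenateNumbers, and catenateAll switches is an assumption based on the Solr factory's option names, not something the diff confirms.

    import java.io.StringReader;

    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WdfSplitSketch {
      public static void main(String[] args) throws Exception {
        // Same flags as doSplit() in TestWordDelimiterFilter: generate word
        // and number parts, catenate nothing. "camelCase" -> "camel", "Case".
        WordDelimiterFilter wdf = new WordDelimiterFilter(
            new KeywordTokenizer(new StringReader("camelCase")), 1, 1, 0, 0, 0);
        CharTermAttribute termAtt = wdf.addAttribute(CharTermAttribute.class);
        wdf.reset();
        while (wdf.incrementToken()) {
          System.out.println(termAtt.toString());
        }
      }
    }
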
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java&p1=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Tue May  4 09:11:05 2010
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
  
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -58,10 +58,9 @@ import java.io.IOException;
  *  in the analyzer used for querying.  Given that the current StandardTokenizer immediately removes many intra-word
  *  delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
  *
- *  @version $Id$
  */
 
-final class WordDelimiterFilter extends TokenFilter {
+public final class WordDelimiterFilter extends TokenFilter {
   
   public static final int LOWER = 0x01;
   public static final int UPPER = 0x02;

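The javadoc retained above recommends running this filter after a tokenizer that keeps intra-word delimiters, such as WhitespaceTokenizer, rather than StandardTokenizer. A minimal sketch of that pairing, adapted from testPositionIncrements in the moved test: the nine int flags follow the test's usage verbatim, while Version.LUCENE_CURRENT is a stand-in for whatever Version the application actually pins, and passing null to disable protected words is an assumption.

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.util.Version;

    Analyzer a = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        // WhitespaceTokenizer preserves "-", "/", etc., so the filter
        // still sees them and can split on them.
        return new WordDelimiterFilter(
            new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
            1, 1, 0, 0, 1, 1, 0, 1, 1, null);  // null protWords: assumption
      }
    };
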
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java (from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java&p1=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java Tue May  4 09:11:05 2010
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,7 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
-import static org.apache.solr.analysis.WordDelimiterFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 
 /**
  * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.

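WordDelimiterIterator's methods are not visible in this diff, so the loop below only illustrates the BreakIterator-like pattern its javadoc describes; setText(), next(), DONE, and the current/end subword bounds are assumed names rather than confirmed API, and the class may not be visible outside its package.

    // Hypothetical sketch of BreakIterator-style subword iteration.
    WordDelimiterIterator iterator = ...;  // construction not shown in this diff
    char[] text = "foo-barCar".toCharArray();
    iterator.setText(text, text.length);                     // assumed method
    while (iterator.next() != WordDelimiterIterator.DONE) {  // assumed
      // each subword spans [current, end) in the buffer     // assumed fields
      System.out.println(new String(text, iterator.current,
          iterator.end - iterator.current));
    }
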
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java&p1=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue May  4 09:11:05 2010
@@ -15,9 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.StopFilter;
@@ -29,12 +30,8 @@ import org.apache.lucene.analysis.miscel
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.solr.SolrTestCaseJ4;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -43,96 +40,10 @@ import java.util.HashSet;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ * TODO: should explicitly test things like protWords and not rely on
+ * the factory tests in Solr.
  */
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    initCore("solrconfig.xml","schema.xml");
-  }
-
-  public void posTst(String v1, String v2, String s1, String s2) {
-    assertU(adoc("id",  "42",
-                 "subword", v1,
-                 "subword", v2));
-    assertU(commit());
-
-    // there is a positionIncrementGap of 100 between field values, so
-    // we test if that was maintained.
-    assertQ("position increment lost",
-            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
-            ,"//result[@numFound=0]"
-    );
-    assertQ("position increment lost",
-            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
-            ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testRetainPositionIncrement() {
-    posTst("foo","bar","foo","bar");
-    posTst("-foo-","-bar-","foo","bar");
-    posTst("foo","bar","-foo-","-bar-");
-
-    posTst("123","456","123","456");
-    posTst("/123/","/456/","123","456");
-
-    posTst("/123/abc","qwe/456/","abc","qwe");
-
-    posTst("zoo-foo","bar-baz","foo","bar");
-    posTst("zoo-foo-123","456-bar-baz","foo","bar");
-  }
-
-  @Test
-  public void testNoGenerationEdgeCase() {
-    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
-    clearIndex();
-  }
-
-  @Test
-  public void testIgnoreCaseChange() {
-
-    assertU(adoc("id",  "43",
-                 "wdf_nocase", "HellO WilliAM",
-                 "subword", "GoodBye JonEs"));
-    assertU(commit());
-    
-    assertQ("no case change",
-            req("wdf_nocase:(hell o am)")
-            ,"//result[@numFound=0]"
-    );
-    assertQ("case change",
-            req("subword:(good jon)")
-            ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testPreserveOrignalTrue() {
-
-    assertU(adoc("id",  "144",
-                 "wdf_preserve", "404-123"));
-    assertU(commit());
-    
-    assertQ("preserving original word",
-            req("wdf_preserve:404")
-            ,"//result[@numFound=1]"
-    );
-    
-    assertQ("preserving original word",
-        req("wdf_preserve:123")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("preserving original word",
-        req("wdf_preserve:404-123*")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
+public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
 
   /***
   public void testPerformance() throws IOException {
@@ -232,59 +143,6 @@ public class TestWordDelimiterFilter ext
         new int[] { 11, 15, 15 });
   }
 
-  @Test
-  public void testAlphaNumericWords(){
-     assertU(adoc("id",  "68","numericsubword","Java/J2SE"));
-     assertU(commit());
-
-     assertQ("j2se found",
-            req("numericsubword:(J2SE)")
-            ,"//result[@numFound=1]"
-    );
-      assertQ("no j2 or se",
-            req("numericsubword:(J2 OR SE)")
-            ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testProtectedWords(){
-    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
-    assertU(commit());
-
-    assertQ("java found",
-            req("protectedsubword:(java)")
-            ,"//result[@numFound=1]"
-    );
-
-    assertQ(".net found",
-            req("protectedsubword:(.net)")
-            ,"//result[@numFound=1]"
-    );
-
-    assertQ("c# found",
-            req("protectedsubword:(c#)")
-            ,"//result[@numFound=1]"
-    );
-
-    assertQ("c++ found",
-            req("protectedsubword:(c++)")
-            ,"//result[@numFound=1]"
-    );
-
-    assertQ("c found?",
-            req("protectedsubword:c")
-            ,"//result[@numFound=0]"
-    );
-    assertQ("net found?",
-            req("protectedsubword:net")
-            ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
-
   public void doSplit(final String input, String... output) throws Exception {
     WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
         new StringReader(input)), 1, 1, 0, 0, 0);
@@ -368,13 +226,13 @@ public class TestWordDelimiterFilter ext
   
   @Test
   public void testPositionIncrements() throws Exception {
-    final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
+    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
     
     /* analyzer that uses whitespace + wdf */
     Analyzer a = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader),
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -401,7 +259,7 @@ public class TestWordDelimiterFilter ext
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
             new LargePosIncTokenFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -431,8 +289,8 @@ public class TestWordDelimiterFilter ext
 
     Analyzer a3 = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
-        StopFilter filter = new StopFilter(DEFAULT_VERSION,
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
+        StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
         filter.setEnablePositionIncrements(true);
         return new WordDelimiterFilter(filter, 
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);

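With the move to BaseTokenStreamTestCase, assertions now run directly against the token stream instead of going through a Solr core. A representative case, copied verbatim from the testOffsets test that the factory test below sheds (constructor flags and expected offsets exactly as they appear in this diff):

    WordDelimiterFilter wdf = new WordDelimiterFilter(
        new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
        1, 1, 0, 0, 1, 1, 0);

    // subwords and the catenated subword keep their original offsets
    assertTokenStreamContents(wdf,
        new String[] { "foo", "bar", "foobar" },
        new int[] { 5, 9, 5 },
        new int[] { 8, 12, 12 });
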
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=940781&r1=940780&r2=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Tue May  4 09:11:05 2010
@@ -18,6 +18,7 @@
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;

Copied: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java?p2=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java&p1=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java Tue May  4 09:11:05 2010
@@ -17,34 +17,14 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.KeywordTokenizer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.solr.SolrTestCaseJ4;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.HashSet;
-
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
  */
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
+public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {
 
   @BeforeClass
   public static void beforeClass() throws Exception {
@@ -153,86 +133,6 @@ public class TestWordDelimiterFilter ext
   ***/
 
   @Test
-  public void testOffsets() throws IOException {
-
-    // test that subwords and catenated subwords have
-    // the correct offsets.
-    WordDelimiterFilter wdf = new WordDelimiterFilter(
-            new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
-    1,1,0,0,1,1,0);
-
-    assertTokenStreamContents(wdf, 
-        new String[] { "foo", "bar", "foobar" },
-        new int[] { 5, 9, 5 }, 
-        new int[] { 8, 12, 12 });
-
-    wdf = new WordDelimiterFilter(
-            new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
-    1,1,0,0,1,1,0);
-    
-    assertTokenStreamContents(wdf,
-        new String[] { "foo", "bar", "foobar" },
-        new int[] { 5, 5, 5 },
-        new int[] { 6, 6, 6 });
-  }
-  
-  @Test
-  public void testOffsetChange() throws Exception
-  {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
-      1,1,0,0,1,1,0
-    );
-    
-    assertTokenStreamContents(wdf,
-        new String[] { "übelkeit" },
-        new int[] { 7 },
-        new int[] { 15 });
-  }
-  
-  @Test
-  public void testOffsetChange2() throws Exception
-  {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
-      1,1,0,0,1,1,0
-    );
-    
-    assertTokenStreamContents(wdf,
-        new String[] { "übelkeit" },
-        new int[] { 8 },
-        new int[] { 17 });
-  }
-  
-  @Test
-  public void testOffsetChange3() throws Exception
-  {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
-      1,1,0,0,1,1,0
-    );
-    
-    assertTokenStreamContents(wdf,
-        new String[] { "übelkeit" },
-        new int[] { 8 },
-        new int[] { 16 });
-  }
-  
-  @Test
-  public void testOffsetChange4() throws Exception
-  {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
-      1,1,0,0,1,1,0
-    );
-    
-    assertTokenStreamContents(wdf,
-        new String[] { "foo", "bar", "foobar"},
-        new int[] { 8, 12, 8 },
-        new int[] { 11, 15, 15 });
-  }
-
-  @Test
   public void testAlphaNumericWords(){
      assertU(adoc("id",  "68","numericsubword","Java/J2SE"));
      assertU(commit());
@@ -283,173 +183,4 @@ public class TestWordDelimiterFilter ext
     );
     clearIndex();
   }
-
-
-  public void doSplit(final String input, String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
-        new StringReader(input)), 1, 1, 0, 0, 0);
-    
-    assertTokenStreamContents(wdf, output);
-  }
-
-  @Test
-  public void testSplits() throws Exception {
-    doSplit("basic-split","basic","split");
-    doSplit("camelCase","camel","Case");
-
-    // non-space marking symbol shouldn't cause split
-    // this is an example in Thai    
-    doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
-    // possessive followed by delimiter
-    doSplit("test's'", "test");
-
-    // some russian upper and lowercase
-    doSplit("Роберт", "Роберт");
-    // now cause a split (russian camelCase)
-    doSplit("РобЕрт", "Роб", "Ерт");
-
-    // a composed titlecase character, don't split
-    doSplit("aDžungla", "aDžungla");
-    
-    // a modifier letter, don't split
-    doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
-    
-    // enclosing mark, don't split
-    doSplit("۞test", "۞test");
-    
-    // combining spacing mark (the virama), don't split
-    doSplit("हिन्दी", "हिन्दी");
-    
-    // don't split non-ascii digits
-    doSplit("١٢٣٤", "١٢٣٤");
-    
-    // don't split supplementaries into unpaired surrogates
-    doSplit("𠀀𠀀", "𠀀𠀀");
-  }
-  
-  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
-        new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
-
-    assertTokenStreamContents(wdf, output);
-  }
-  
-  /*
-   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
-   */
-  @Test
-  public void testPossessives() throws Exception {
-    doSplitPossessive(1, "ra's", "ra");
-    doSplitPossessive(0, "ra's", "ra", "s");
-  }
-  
-  /*
-   * Set a large position increment gap of 10 if the token is "largegap" or "/"
-   */
-  private final class LargePosIncTokenFilter extends TokenFilter {
-    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    
-    protected LargePosIncTokenFilter(TokenStream input) {
-      super(input);
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-      if (input.incrementToken()) {
-        if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/"))
-          posIncAtt.setPositionIncrement(10);
-        return true;
-      } else {
-        return false;
-      }
-    }  
-  }
-  
-  @Test
-  public void testPositionIncrements() throws Exception {
-    final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
-    
-    /* analyzer that uses whitespace + wdf */
-    Analyzer a = new Analyzer() {
-      public TokenStream tokenStream(String field, Reader reader) {
-        return new WordDelimiterFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader),
-            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
-      }
-    };
-
-    /* in this case, works as expected. */
-    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
-        new int[] { 0, 9 },
-        new int[] { 6, 13 },
-        new int[] { 1, 1 });
-    
-    /* only in this case, posInc of 2 ?! */
-    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
-        new int[] { 0, 9, 12, 9 },
-        new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 1, 1, 0 });
-    
-    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
-        new int[] { 0, 9, 15 },
-        new int[] { 6, 14, 19 },
-        new int[] { 1, 1, 1 });
-    
-    /* analyzer that will consume tokens with large position increments */
-    Analyzer a2 = new Analyzer() {
-      public TokenStream tokenStream(String field, Reader reader) {
-        return new WordDelimiterFilter(
-            new LargePosIncTokenFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
-            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
-      }
-    };
-    
-    /* increment of "largegap" is preserved */
-    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
-        new int[] { 0, 7, 16 },
-        new int[] { 6, 15, 20 },
-        new int[] { 1, 10, 1 });
-    
-    /* the "/" had a position increment of 10, where did it go?!?!! */
-    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
-        new int[] { 0, 9 },
-        new int[] { 6, 13 },
-        new int[] { 1, 11 });
-    
-    /* in this case, the increment of 10 from the "/" is carried over */
-    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
-        new int[] { 0, 9, 12, 9 },
-        new int[] { 6, 12, 13, 13 },
-        new int[] { 1, 11, 1, 0 });
-    
-    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
-        new int[] { 0, 9, 15 },
-        new int[] { 6, 14, 19 },
-        new int[] { 1, 11, 1 });
-
-    Analyzer a3 = new Analyzer() {
-      public TokenStream tokenStream(String field, Reader reader) {
-        StopFilter filter = new StopFilter(DEFAULT_VERSION,
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
-        filter.setEnablePositionIncrements(true);
-        return new WordDelimiterFilter(filter, 
-            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
-      }
-    };
-
-    assertAnalyzesTo(a3, "lucene.solr", 
-        new String[] { "lucene", "solr", "lucenesolr" },
-        new int[] { 0, 7, 0 },
-        new int[] { 6, 11, 11 },
-        new int[] { 1, 1, 0 });
-
-    /* the stopword should add a gap here */
-    assertAnalyzesTo(a3, "the lucene.solr", 
-        new String[] { "lucene", "solr", "lucenesolr" }, 
-        new int[] { 4, 11, 4 }, 
-        new int[] { 10, 15, 15 },
-        new int[] { 2, 1, 0 });
-  }
 }