Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/04 11:11:06 UTC
svn commit: r940781 - in /lucene/dev/trunk: lucene/contrib/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/
solr/src/java/org/apache/s...
Author: rmuir
Date: Tue May 4 09:11:05 2010
New Revision: 940781
URL: http://svn.apache.org/viewvc?rev=940781&view=rev
Log:
LUCENE-2413: consolidate WDF into contrib/analyzers
Added:
lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
- copied, changed from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
- copied, changed from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
- copied, changed from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java
- copied, changed from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Removed:
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=940781&r1=940780&r2=940781&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Tue May 4 09:11:05 2010
@@ -161,6 +161,8 @@ New features
and phrases.
- o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
constructs.
+ - o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
+ into subwords and performs optional transformations on subword groups.
(... in progress)
Build
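The CHANGES entry above summarizes the filter's job: split tokens on intra-word delimiters, then optionally transform the resulting subword groups. As a minimal sketch of driving the consolidated filter directly, reusing the five-flag constructor call that doSplit() in the copied test exercises (the flag names in the comment are assumptions about parameter order, not taken from this diff):

    import java.io.StringReader;
    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Emit "basic" then "split" for the input "basic-split", as in testSplits().
    TokenStream ts = new WordDelimiterFilter(
        new KeywordTokenizer(new StringReader("basic-split")),
        1, 1, 0, 0, 0); // assumed order: generateWordParts, generateNumberParts,
                        // catenateWords, catenateNumbers, catenateAll
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.close();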
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java (from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java&p1=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java Tue May 4 09:11:05 2010
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -58,10 +58,9 @@ import java.io.IOException;
* in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
* delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
*
- * @version $Id$
*/
-final class WordDelimiterFilter extends TokenFilter {
+public final class WordDelimiterFilter extends TokenFilter {
public static final int LOWER = 0x01;
public static final int UPPER = 0x02;
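Note the visibility change in this hunk: the filter is now public so it can be used outside org.apache.lucene.analysis.miscellaneous. Per the javadoc kept above, it belongs behind a tokenizer that preserves intra-word delimiters. A sketch of that wiring, mirroring the analyzer the updated test below builds (TEST_VERSION_CURRENT comes from BaseTokenStreamTestCase):

    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT,
        new HashSet<String>(Arrays.asList("NUTCH")), false);
    Analyzer a = new Analyzer() {
      public TokenStream tokenStream(String field, Reader reader) {
        // WhitespaceTokenizer keeps "-", "/", etc. inside tokens, so the
        // intra-word delimiters survive for WordDelimiterFilter to split on;
        // StandardTokenizer would have stripped many of them already.
        return new WordDelimiterFilter(
            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
            1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
      }
    };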
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java (from r940768, lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java&p1=lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterIterator.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java Tue May 4 09:11:05 2010
@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,7 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
-import static org.apache.solr.analysis.WordDelimiterFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
/**
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
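The iterator is package-private, so the following is only an illustrative same-package sketch; iter stands in for a constructed WordDelimiterIterator, and setText/next/DONE plus the current/end fields are assumptions drawn from the copied source, not shown in this diff:

    char[] buf = "PowerShot500".toCharArray();
    iter.setText(buf, buf.length);             // aim the iterator at the buffer
    while (iter.next() != WordDelimiterIterator.DONE) {
      // current..end bound the subword just found, BreakIterator-style
      System.out.println(new String(buf, iter.current, iter.end - iter.current));
    }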
Copied: lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java?p2=lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java&p1=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java Tue May 4 09:11:05 2010
@@ -15,9 +15,10 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.StopFilter;
@@ -29,12 +30,8 @@ import org.apache.lucene.analysis.miscel
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.solr.SolrTestCaseJ4;
-import org.junit.BeforeClass;
import org.junit.Test;
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -43,96 +40,10 @@ import java.util.HashSet;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ * TODO: should explicitly test things like protWords and not rely on
+ * the factory tests in Solr.
*/
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
-
- @BeforeClass
- public static void beforeClass() throws Exception {
- initCore("solrconfig.xml","schema.xml");
- }
-
- public void posTst(String v1, String v2, String s1, String s2) {
- assertU(adoc("id", "42",
- "subword", v1,
- "subword", v2));
- assertU(commit());
-
- // there is a positionIncrementGap of 100 between field values, so
- // we test if that was maintained.
- assertQ("position increment lost",
- req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
- ,"//result[@numFound=0]"
- );
- assertQ("position increment lost",
- req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
- ,"//result[@numFound=1]"
- );
- clearIndex();
- }
-
- @Test
- public void testRetainPositionIncrement() {
- posTst("foo","bar","foo","bar");
- posTst("-foo-","-bar-","foo","bar");
- posTst("foo","bar","-foo-","-bar-");
-
- posTst("123","456","123","456");
- posTst("/123/","/456/","123","456");
-
- posTst("/123/abc","qwe/456/","abc","qwe");
-
- posTst("zoo-foo","bar-baz","foo","bar");
- posTst("zoo-foo-123","456-bar-baz","foo","bar");
- }
-
- @Test
- public void testNoGenerationEdgeCase() {
- assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
- clearIndex();
- }
-
- @Test
- public void testIgnoreCaseChange() {
-
- assertU(adoc("id", "43",
- "wdf_nocase", "HellO WilliAM",
- "subword", "GoodBye JonEs"));
- assertU(commit());
-
- assertQ("no case change",
- req("wdf_nocase:(hell o am)")
- ,"//result[@numFound=0]"
- );
- assertQ("case change",
- req("subword:(good jon)")
- ,"//result[@numFound=1]"
- );
- clearIndex();
- }
-
- @Test
- public void testPreserveOrignalTrue() {
-
- assertU(adoc("id", "144",
- "wdf_preserve", "404-123"));
- assertU(commit());
-
- assertQ("preserving original word",
- req("wdf_preserve:404")
- ,"//result[@numFound=1]"
- );
-
- assertQ("preserving original word",
- req("wdf_preserve:123")
- ,"//result[@numFound=1]"
- );
-
- assertQ("preserving original word",
- req("wdf_preserve:404-123*")
- ,"//result[@numFound=1]"
- );
- clearIndex();
- }
+public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
/***
public void testPerformance() throws IOException {
@@ -232,59 +143,6 @@ public class TestWordDelimiterFilter ext
new int[] { 11, 15, 15 });
}
- @Test
- public void testAlphaNumericWords(){
- assertU(adoc("id", "68","numericsubword","Java/J2SE"));
- assertU(commit());
-
- assertQ("j2se found",
- req("numericsubword:(J2SE)")
- ,"//result[@numFound=1]"
- );
- assertQ("no j2 or se",
- req("numericsubword:(J2 OR SE)")
- ,"//result[@numFound=0]"
- );
- clearIndex();
- }
-
- @Test
- public void testProtectedWords(){
- assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
- assertU(commit());
-
- assertQ("java found",
- req("protectedsubword:(java)")
- ,"//result[@numFound=1]"
- );
-
- assertQ(".net found",
- req("protectedsubword:(.net)")
- ,"//result[@numFound=1]"
- );
-
- assertQ("c# found",
- req("protectedsubword:(c#)")
- ,"//result[@numFound=1]"
- );
-
- assertQ("c++ found",
- req("protectedsubword:(c++)")
- ,"//result[@numFound=1]"
- );
-
- assertQ("c found?",
- req("protectedsubword:c")
- ,"//result[@numFound=0]"
- );
- assertQ("net found?",
- req("protectedsubword:net")
- ,"//result[@numFound=0]"
- );
- clearIndex();
- }
-
-
public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1, 1, 0, 0, 0);
@@ -368,13 +226,13 @@ public class TestWordDelimiterFilter ext
@Test
public void testPositionIncrements() throws Exception {
- final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
+ final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
- new WhitespaceTokenizer(DEFAULT_VERSION, reader),
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@@ -401,7 +259,7 @@ public class TestWordDelimiterFilter ext
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new LargePosIncTokenFilter(
- new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@@ -431,8 +289,8 @@ public class TestWordDelimiterFilter ext
Analyzer a3 = new Analyzer() {
public TokenStream tokenStream(String field, Reader reader) {
- StopFilter filter = new StopFilter(DEFAULT_VERSION,
- new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
+ StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
filter.setEnablePositionIncrements(true);
return new WordDelimiterFilter(filter,
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java?rev=940781&r1=940780&r2=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java Tue May 4 09:11:05 2010
@@ -18,6 +18,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;
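Only the import changes here; the factory's configuration surface is untouched, it just hands back the now-Lucene filter. A hypothetical direct-use sketch against the Solr factory lifecycle of this era (init(Map) then create(TokenStream); 'source' stands in for any upstream tokenizer, and the argument names are the factory's usual ones rather than anything shown in this diff):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.solr.analysis.WordDelimiterFilterFactory;

    Map<String, String> args = new HashMap<String, String>();
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("splitOnCaseChange", "1");

    WordDelimiterFilterFactory factory = new WordDelimiterFilterFactory();
    factory.init(args);                      // normally driven by schema.xml
    TokenStream ts = factory.create(source); // produces the Lucene filter imported above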
Copied: lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (from r940768, lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java?p2=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java&p1=lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java&r1=940768&r2=940781&rev=940781&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java Tue May 4 09:11:05 2010
@@ -17,34 +17,14 @@
package org.apache.solr.analysis;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.KeywordTokenizer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.HashSet;
-
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
+public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
@@ -153,86 +133,6 @@ public class TestWordDelimiterFilter ext
***/
@Test
- public void testOffsets() throws IOException {
-
- // test that subwords and catenated subwords have
- // the correct offsets.
- WordDelimiterFilter wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
- 1,1,0,0,1,1,0);
-
- assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar" },
- new int[] { 5, 9, 5 },
- new int[] { 8, 12, 12 });
-
- wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
- 1,1,0,0,1,1,0);
-
- assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar" },
- new int[] { 5, 5, 5 },
- new int[] { 6, 6, 6 });
- }
-
- @Test
- public void testOffsetChange() throws Exception
- {
- WordDelimiterFilter wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
- 1,1,0,0,1,1,0
- );
-
- assertTokenStreamContents(wdf,
- new String[] { "übelkeit" },
- new int[] { 7 },
- new int[] { 15 });
- }
-
- @Test
- public void testOffsetChange2() throws Exception
- {
- WordDelimiterFilter wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
- 1,1,0,0,1,1,0
- );
-
- assertTokenStreamContents(wdf,
- new String[] { "übelkeit" },
- new int[] { 8 },
- new int[] { 17 });
- }
-
- @Test
- public void testOffsetChange3() throws Exception
- {
- WordDelimiterFilter wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
- 1,1,0,0,1,1,0
- );
-
- assertTokenStreamContents(wdf,
- new String[] { "übelkeit" },
- new int[] { 8 },
- new int[] { 16 });
- }
-
- @Test
- public void testOffsetChange4() throws Exception
- {
- WordDelimiterFilter wdf = new WordDelimiterFilter(
- new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
- 1,1,0,0,1,1,0
- );
-
- assertTokenStreamContents(wdf,
- new String[] { "foo", "bar", "foobar"},
- new int[] { 8, 12, 8 },
- new int[] { 11, 15, 15 });
- }
-
- @Test
public void testAlphaNumericWords(){
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
assertU(commit());
@@ -283,173 +183,4 @@ public class TestWordDelimiterFilter ext
);
clearIndex();
}
-
-
- public void doSplit(final String input, String... output) throws Exception {
- WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
- new StringReader(input)), 1, 1, 0, 0, 0);
-
- assertTokenStreamContents(wdf, output);
- }
-
- @Test
- public void testSplits() throws Exception {
- doSplit("basic-split","basic","split");
- doSplit("camelCase","camel","Case");
-
- // non-space marking symbol shouldn't cause split
- // this is an example in Thai
- doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
- // possessive followed by delimiter
- doSplit("test's'", "test");
-
- // some russian upper and lowercase
- doSplit("РобеÑÑ", "РобеÑÑ");
- // now cause a split (russian camelCase)
- doSplit("РобÐÑÑ", "Роб", "ÐÑÑ");
-
- // a composed titlecase character, don't split
- doSplit("aÇ
ungla", "aÇ
ungla");
-
- // a modifier letter, don't split
- doSplit("سÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙاÙ
", "سÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙÙاÙ
");
-
- // enclosing mark, don't split
- doSplit("Ûtest", "Ûtest");
-
- // combining spacing mark (the virama), don't split
- doSplit("हिनà¥à¤¦à¥", "हिनà¥à¤¦à¥");
-
- // don't split non-ascii digits
- doSplit("١٢٣٤", "١٢٣٤");
-
- // don't split supplementaries into unpaired surrogates
- doSplit("ð ð ", "ð ð ");
- }
-
- public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
- WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
- new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
-
- assertTokenStreamContents(wdf, output);
- }
-
- /*
- * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
- */
- @Test
- public void testPossessives() throws Exception {
- doSplitPossessive(1, "ra's", "ra");
- doSplitPossessive(0, "ra's", "ra", "s");
- }
-
- /*
- * Set a large position increment gap of 10 if the token is "largegap" or "/"
- */
- private final class LargePosIncTokenFilter extends TokenFilter {
- private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-
- protected LargePosIncTokenFilter(TokenStream input) {
- super(input);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (input.incrementToken()) {
- if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/"))
- posIncAtt.setPositionIncrement(10);
- return true;
- } else {
- return false;
- }
- }
- }
-
- @Test
- public void testPositionIncrements() throws Exception {
- final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
-
- /* analyzer that uses whitespace + wdf */
- Analyzer a = new Analyzer() {
- public TokenStream tokenStream(String field, Reader reader) {
- return new WordDelimiterFilter(
- new WhitespaceTokenizer(DEFAULT_VERSION, reader),
- 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
- }
- };
-
- /* in this case, works as expected. */
- assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
- new int[] { 0, 9 },
- new int[] { 6, 13 },
- new int[] { 1, 1 });
-
- /* only in this case, posInc of 2 ?! */
- assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
- new int[] { 6, 12, 13, 13 },
- new int[] { 1, 1, 1, 0 });
-
- assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
- new int[] { 0, 9, 15 },
- new int[] { 6, 14, 19 },
- new int[] { 1, 1, 1 });
-
- /* analyzer that will consume tokens with large position increments */
- Analyzer a2 = new Analyzer() {
- public TokenStream tokenStream(String field, Reader reader) {
- return new WordDelimiterFilter(
- new LargePosIncTokenFilter(
- new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
- 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
- }
- };
-
- /* increment of "largegap" is preserved */
- assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
- new int[] { 0, 7, 16 },
- new int[] { 6, 15, 20 },
- new int[] { 1, 10, 1 });
-
- /* the "/" had a position increment of 10, where did it go?!?!! */
- assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
- new int[] { 0, 9 },
- new int[] { 6, 13 },
- new int[] { 1, 11 });
-
- /* in this case, the increment of 10 from the "/" is carried over */
- assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "R", "solR" },
- new int[] { 0, 9, 12, 9 },
- new int[] { 6, 12, 13, 13 },
- new int[] { 1, 11, 1, 0 });
-
- assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
- new int[] { 0, 9, 15 },
- new int[] { 6, 14, 19 },
- new int[] { 1, 11, 1 });
-
- Analyzer a3 = new Analyzer() {
- public TokenStream tokenStream(String field, Reader reader) {
- StopFilter filter = new StopFilter(DEFAULT_VERSION,
- new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
- filter.setEnablePositionIncrements(true);
- return new WordDelimiterFilter(filter,
- 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
- }
- };
-
- assertAnalyzesTo(a3, "lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 0, 7, 0 },
- new int[] { 6, 11, 11 },
- new int[] { 1, 1, 0 });
-
- /* the stopword should add a gap here */
- assertAnalyzesTo(a3, "the lucene.solr",
- new String[] { "lucene", "solr", "lucenesolr" },
- new int[] { 4, 11, 4 },
- new int[] { 10, 15, 15 },
- new int[] { 2, 1, 0 });
- }
}