You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/02/26 14:10:08 UTC
svn commit: r916666 [4/16] - in /lucene/java/branches/flex_1458: ./ contrib/
contrib/analyzers/common/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/
contrib/analyzers/c...
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemmer.java Fri Feb 26 13:09:54 2010
@@ -17,12 +17,14 @@
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
+import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -139,6 +141,34 @@
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
}
+ public void testStemExclusionTableBWCompat() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader("Brasília Brasilia")), set);
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+ }
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Brasília Brasilia")), set));
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
+ }
+
+ public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("Brasília");
+ CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set1.add("Brasilia");
+ BrazilianStemFilter filter = new BrazilianStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Brasília Brasilia")), set), set1);
+ assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
+ }
+
/*
* Test that changes to the exclusion table are applied immediately
* when using reusable token streams.
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Fri Feb 26 13:09:54 2010
@@ -18,14 +18,9 @@
*/
import java.io.IOException;
-import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
public class TestCJKTokenizer extends BaseTokenStreamTestCase {
@@ -47,33 +42,33 @@
}
public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
- CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
- TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
- TypeAttribute typeAtt = tokenizer.getAttribute(TypeAttribute.class);
+ Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+ String terms[] = new String[out_tokens.length];
+ int startOffsets[] = new int[out_tokens.length];
+ int endOffsets[] = new int[out_tokens.length];
+ String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
- assertTrue(tokenizer.incrementToken());
- assertEquals(termAtt.term(), out_tokens[i].termText);
- assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
- assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
- assertEquals(typeAtt.type(), out_tokens[i].type);
+ terms[i] = out_tokens[i].termText;
+ startOffsets[i] = out_tokens[i].start;
+ endOffsets[i] = out_tokens[i].end;
+ types[i] = out_tokens[i].type;
}
- assertFalse(tokenizer.incrementToken());
+ assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
- TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
- TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
- TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
+ Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+ String terms[] = new String[out_tokens.length];
+ int startOffsets[] = new int[out_tokens.length];
+ int endOffsets[] = new int[out_tokens.length];
+ String types[] = new String[out_tokens.length];
for (int i = 0; i < out_tokens.length; i++) {
- assertTrue(ts.incrementToken());
- assertEquals(termAtt.term(), out_tokens[i].termText);
- assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
- assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
- assertEquals(typeAtt.type(), out_tokens[i].type);
+ terms[i] = out_tokens[i].termText;
+ startOffsets[i] = out_tokens[i].start;
+ endOffsets[i] = out_tokens[i].end;
+ types[i] = out_tokens[i].type;
}
- assertFalse(ts.incrementToken());
+ assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
}
public void testJa1() throws IOException {
@@ -219,13 +214,8 @@
public void testTokenStream() throws Exception {
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
- TokenStream ts = analyzer.tokenStream("dummy", new StringReader("\u4e00\u4e01\u4e02"));
- TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
- assertTrue(ts.incrementToken());
- assertEquals("\u4e00\u4e01", termAtt.term());
- assertTrue(ts.incrementToken());
- assertEquals("\u4e01\u4e02", termAtt.term());
- assertFalse(ts.incrementToken());
+ assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
+ new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
}
public void testReusableTokenStream() throws Exception {
@@ -261,4 +251,24 @@
};
checkCJKTokenReusable(analyzer, str, out_tokens2);
}
+
+ /**
+ * LUCENE-2207: wrong offset calculated by end()
+ */
+ public void testFinalOffset() throws IOException {
+ checkCJKToken("あい", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ checkCJKToken("あい ", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ checkCJKToken("test", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("test ", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("あいtest", new TestToken[] {
+ newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+ newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
+ checkCJKToken("testあい ", new TestToken[] {
+ newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
+ newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
+ }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cn/TestChineseTokenizer.java Fri Feb 26 13:09:54 2010
@@ -24,11 +24,13 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.Version;
+/** @deprecated Remove this test when ChineseAnalyzer is removed. */
+@Deprecated
public class TestChineseTokenizer extends BaseTokenStreamTestCase
{
public void testOtherLetterOffset() throws IOException
@@ -78,7 +80,7 @@
private class JustChineseFilterAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new ChineseFilter(new WhitespaceTokenizer(reader));
+ return new ChineseFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader));
}
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
@@ -46,8 +47,8 @@
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
- HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
- new WhitespaceTokenizer(new StringReader(
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT,
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"min veninde som er lidt af en læsehest")), hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
@@ -66,8 +67,8 @@
.getHyphenationTree(reader);
// the word basket will not be added due to the longest match option
- HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
- new WhitespaceTokenizer(new StringReader(
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT,
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"basketballkurv")), hyphenator, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
@@ -83,8 +84,8 @@
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad" };
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
- new WhitespaceTokenizer(
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT,
new StringReader(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
@@ -112,8 +113,8 @@
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
- new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")),
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
@@ -128,9 +129,9 @@
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
- Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader(
+ Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"Rindfleischüberwachungsgesetz"));
- DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+ DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
wsTokenizer, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -109,5 +110,11 @@
assertAnalyzesToReuse(cz, "Česká Republika", new String[] { "česká" });
}
-
+
+ public void testWithStemExclusionSet() throws IOException{
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("hole");
+ CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
+ }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemmer.java Fri Feb 26 13:09:54 2010
@@ -18,8 +18,12 @@
*/
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
/**
@@ -270,4 +274,13 @@
assertAnalyzesTo(cz, "e", new String[] { "e" });
assertAnalyzesTo(cz, "zi", new String[] { "zi" });
}
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("hole");
+ CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerTokenFilter(
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hole desek")), set));
+ assertTokenStreamContents(filter, new String[] { "hole", "desk" });
+ }
+
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Fri Feb 26 13:09:54 2010
@@ -21,9 +21,13 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
+import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
/**
@@ -35,6 +39,8 @@
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception {
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+ TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
// read test cases from external file:
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -50,36 +56,12 @@
continue; // ignore comments and empty lines
String[] parts = line.split(";");
//System.out.println(parts[0] + " -- " + parts[1]);
- check(parts[0], parts[1]);
+ tokenizer.reset(new StringReader(parts[0]));
+ filter.reset();
+ assertTokenStreamContents(filter, new String[] { parts[1] });
}
breader.close();
isr.close();
fis.close();
}
-
- public void testReusableTokenStream() throws Exception {
- Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "Tisch", "tisch");
- checkReuse(a, "Tische", "tisch");
- checkReuse(a, "Tischen", "tisch");
- }
-
- /*
- * Test that changes to the exclusion table are applied immediately
- * when using reusable token streams.
- */
- public void testExclusionTableReuse() throws Exception {
- GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "tischen", "tisch");
- a.setStemExclusionTable(new String[] { "tischen" });
- checkReuse(a, "tischen", "tischen");
- }
-
- private void check(final String input, final String expected) throws Exception {
- checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
- }
-
- private void checkReuse(Analyzer a, String input, String expected) throws Exception {
- checkOneTermReuse(a, input, expected);
- }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java Fri Feb 26 13:09:54 2010
@@ -18,7 +18,6 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
@@ -63,4 +62,23 @@
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
+
+ /**
+ * Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
+ * check that this is preserved.
+ * @deprecated remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testAcronymBWCompat() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
+ }
+
+ /**
+ * test that acronym normalization works
+ */
+ public void testAcronym() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
+ }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.util.Version;
/**
* Test the Persian Normalization Filter
@@ -54,7 +55,7 @@
}
private void check(final String input, final String expected) throws IOException {
- ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(
+ ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT,
new StringReader(input));
PersianNormalizationFilter filter = new PersianNormalizationFilter(
tokenStream);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -17,11 +17,10 @@
* limitations under the License.
*/
-import java.io.StringReader;
+import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -116,6 +115,94 @@
}
+ /**
+ * @deprecated remove this test for Lucene 4.0
+ */
+ @Deprecated
+ public void testAnalyzer30() throws Exception {
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
+
+ assertAnalyzesTo(fa, "", new String[] {
+ });
+
+ assertAnalyzesTo(
+ fa,
+ "chien chat cheval",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(
+ fa,
+ "chien CHAT CHEVAL",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(
+ fa,
+ " chien ,? + = - CHAT /: > CHEVAL",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+ assertAnalyzesTo(
+ fa,
+ "mot \"entreguillemet\"",
+ new String[] { "mot", "entreguillemet" });
+
+ // let's do some french specific tests now
+
+ /* 1. couldn't resist
+ I would expect this to stay one term as in French the minus
+ sign is often used for composing words */
+ assertAnalyzesTo(
+ fa,
+ "Jean-François",
+ new String[] { "jean", "françois" });
+
+ // 2. stopwords
+ assertAnalyzesTo(
+ fa,
+ "le la chien les aux chat du des à cheval",
+ new String[] { "chien", "chat", "cheval" });
+
+ // some nouns and adjectives
+ assertAnalyzesTo(
+ fa,
+ "lances chismes habitable chiste éléments captifs",
+ new String[] {
+ "lanc",
+ "chism",
+ "habit",
+ "chist",
+ "élément",
+ "captif" });
+
+ // some verbs
+ assertAnalyzesTo(
+ fa,
+ "finissions souffrirent rugissante",
+ new String[] { "fin", "souffr", "rug" });
+
+ // some everything else
+ // aujourd'hui stays one term which is OK
+ assertAnalyzesTo(
+ fa,
+ "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+ new String[] {
+ "c3po",
+ "aujourd'hui",
+ "oeuf",
+ "ïâöûàä",
+ "anticonstitutionnel",
+ "jav" });
+
+ // some more everything else
+ // here 1940-1945 stays as one term, 1940:1945 not ?
+ assertAnalyzesTo(
+ fa,
+ "33Bis 1940-1945 1940:1945 (---i+++)*",
+ new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
+
+ }
+
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
// stopwords
@@ -147,4 +234,41 @@
fa.setStemExclusionTable(new String[] { "habitable" });
assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
}
+
+ public void testExclusionTableViaCtor() throws Exception {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("habitable");
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT,
+ CharArraySet.EMPTY_SET, set);
+ assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
+ "chist" });
+
+ fa = new FrenchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
+ "chist" });
+ }
+
+ public void testElision() throws Exception {
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+ assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+ }
+
+ /**
+ * Prior to 3.1, this analyzer had no lowercase filter.
+ * stopwords were case sensitive. Preserve this for back compat.
+ * @deprecated Remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testBuggyStopwordsCasing() throws IOException {
+ FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Votre", new String[] { "votr" });
+ }
+
+ /**
+ * Test that stopwords are not case sensitive
+ */
+ public void testStopwordsCasing() throws IOException {
+ FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Votre", new String[] { });
+ }
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -19,10 +19,8 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -33,25 +31,13 @@
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
new SingleTokenTokenStream(createToken("^", 0, 0)),
- new WhitespaceTokenizer(new StringReader("hello world")),
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")),
new SingleTokenTokenStream(createToken("$", 0, 0)));
- assertNext(ts, "^", 0, 0);
- assertNext(ts, "hello", 0, 5);
- assertNext(ts, "world", 6, 11);
- assertNext(ts, "$", 11, 11);
- assertFalse(ts.incrementToken());
- }
-
-
- private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
- TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
-
- assertTrue(ts.incrementToken());
- assertEquals(text, termAtt.term());
- assertEquals(startOffset, offsetAtt.startOffset());
- assertEquals(endOffset, offsetAtt.endOffset());
+ assertTokenStreamContents(ts,
+ new String[] { "^", "hello", "world", "$" },
+ new int[] { 0, 0, 6, 11 },
+ new int[] { 0, 5, 11, 11 });
}
private static Token createToken(String term, int start, int offset)
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java Fri Feb 26 13:09:54 2010
@@ -19,10 +19,8 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -36,31 +34,21 @@
ts = new PrefixAwareTokenFilter(
new SingleTokenTokenStream(createToken("a", 0, 1)),
new SingleTokenTokenStream(createToken("b", 0, 1)));
- assertNext(ts, "a", 0, 1);
- assertNext(ts, "b", 1, 2);
- assertFalse(ts.incrementToken());
+ assertTokenStreamContents(ts,
+ new String[] { "a", "b" },
+ new int[] { 0, 1 },
+ new int[] { 1, 2 });
// prefix and suffix using 2x prefix
- ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
+ ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
+ new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
- assertNext(ts, "^", 0, 0);
- assertNext(ts, "hello", 0, 5);
- assertNext(ts, "world", 6, 11);
- assertNext(ts, "$", 11, 11);
- assertFalse(ts.incrementToken());
- }
-
-
- private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
- TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
-
- assertTrue(ts.incrementToken());
- assertEquals(text, termAtt.term());
- assertEquals(startOffset, offsetAtt.startOffset());
- assertEquals(endOffset, offsetAtt.endOffset());
+ assertTokenStreamContents(ts,
+ new String[] { "^", "hello", "world", "$" },
+ new int[] { 0, 0, 6, 11 },
+ new int[] { 0, 5, 11, 11 });
}
private static Token createToken(String term, int start, int offset)
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
import java.io.StringReader;
@@ -32,7 +33,7 @@
@Override
public void setUp() throws Exception {
super.setUp();
- input = new WhitespaceTokenizer(new StringReader("abcde"));
+ input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
}
public void testInvalidInput() throws Exception {
@@ -91,13 +92,13 @@
}
public void testSmallTokenInStream() throws Exception {
- input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+ input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
- assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+ assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
- WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
tokenizer.reset(new StringReader("abcde"));
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -66,33 +66,33 @@
public void testFrontUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1});
+ assertTokenStreamContents(tokenizer, new String[]{"a"}, new int[]{0}, new int[]{1}, 5 /* abcde */);
}
public void testBackUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5});
+ assertTokenStreamContents(tokenizer, new String[]{"e"}, new int[]{4}, new int[]{5}, 5 /* abcde */);
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
- assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
}
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5});
+ assertTokenStreamContents(tokenizer, new String[]{"e","de","cde"}, new int[]{4,3,2}, new int[]{5,5,5}, 5 /* abcde */);
}
public void testReset() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
- assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
+ assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
}
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -20,8 +20,8 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.StringReader;
/**
@@ -33,7 +33,7 @@
@Override
public void setUp() throws Exception {
super.setUp();
- input = new WhitespaceTokenizer(new StringReader("abcde"));
+ input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
}
public void testInvalidInput() throws Exception {
@@ -81,13 +81,13 @@
}
public void testSmallTokenInStream() throws Exception {
- input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+ input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
- assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,0}, new int[]{3,3});
+ assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
- WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader("abcde"));
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
tokenizer.reset(new StringReader("abcde"));
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Feb 26 13:09:54 2010
@@ -18,10 +18,8 @@
*/
-import java.io.IOException;
import java.io.StringReader;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
@@ -58,12 +56,12 @@
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
- assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
@@ -71,19 +69,20 @@
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
- new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
+ new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
+ 5 /* abcde */
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
- assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0]);
+ assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
- assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
+ assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Fri Feb 26 13:09:54 2010
@@ -18,9 +18,11 @@
*/
import java.io.File;
+import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
/**
@@ -98,9 +100,6 @@
check("ophalend", "ophal");
check("ophalers", "ophaler");
check("ophef", "ophef");
- check("opheffen", "ophef"); // versus snowball 'opheff'
- check("opheffende", "ophef"); // versus snowball 'opheff'
- check("opheffing", "ophef"); // versus snowball 'opheff'
check("opheldering", "ophelder");
check("ophemelde", "ophemeld");
check("ophemelen", "ophemel");
@@ -116,6 +115,24 @@
check("ophouden", "ophoud");
}
+ /**
+ * @deprecated remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testOldBuggyStemmer() throws Exception {
+ Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
+ checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
+ checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
+ checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
+ }
+
+ public void testSnowballCorrectness() throws Exception {
+ Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "opheffen", "opheff");
+ checkOneTermReuse(a, "opheffende", "opheff");
+ checkOneTermReuse(a, "opheffing", "opheff");
+ }
+
public void testReusableTokenStream() throws Exception {
Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
@@ -133,6 +150,19 @@
checkOneTermReuse(a, "lichamelijk", "licham");
a.setStemExclusionTable(new String[] { "lichamelijk" });
checkOneTermReuse(a, "lichamelijk", "lichamelijk");
+
+
+ }
+
+ public void testExclusionTableViaCtor() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
+ set.add("lichamelijk");
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
+ a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
+
}
/*
@@ -146,6 +176,25 @@
checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
}
+ /**
+ * Prior to 3.1, this analyzer had no lowercase filter.
+ * stopwords were case sensitive. Preserve this for back compat.
+ * @deprecated Remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testBuggyStopwordsCasing() throws IOException {
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
+ }
+
+ /**
+ * Test that stopwords are not case sensitive
+ */
+ public void testStopwordsCasing() throws IOException {
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Zelf", new String[] { });
+ }
+
private void check(final String input, final String expected) throws Exception {
checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
import java.io.StringReader;
@@ -35,7 +36,7 @@
public void testPayloads() throws Exception {
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
- (new WhitespaceTokenizer(new StringReader(test)),
+ (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
@@ -56,7 +57,7 @@
String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
- (new WhitespaceTokenizer(new StringReader(test)),
+ (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)),
DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
assertTermEquals("The", filter, null);
assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
@@ -74,7 +75,7 @@
public void testFloatEncoding() throws Exception {
String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
- DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)), '|', new FloatEncoder());
+ DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new FloatEncoder());
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);
@@ -92,7 +93,7 @@
public void testIntEncoding() throws Exception {
String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
- DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)), '|', new IntegerEncoder());
+ DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
assertTermEquals("The", filter, termAtt, payAtt, null);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -37,7 +38,7 @@
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
- NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
+ NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))), 3, "D");
boolean seenDogs = false;
TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -35,7 +36,7 @@
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
- TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
+ TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
int count = 0;
PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java Fri Feb 26 13:09:54 2010
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -37,7 +38,7 @@
public void test() throws IOException {
String test = "The quick red fox jumped over the lazy brown dogs";
- TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
+ TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
int count = 0;
TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java Fri Feb 26 13:09:54 2010
@@ -51,7 +51,7 @@
protected void setUp() throws Exception {
super.setUp();
dir = new RAMDirectory();
- appAnalyzer = new WhitespaceAnalyzer();
+ appAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
int numDocs = 200;
for (int i = 0; i < numDocs; i++) {
@@ -157,9 +157,9 @@
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
- return new WhitespaceTokenizer(reader);
+ return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
else
- return new LetterTokenizer(reader);
+ return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
}
}
@@ -173,7 +173,7 @@
}
public void testTokenStream() throws Exception {
- QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer());
+ QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
a.addStopWords(reader, 10);
TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Fri Feb 26 13:09:54 2010
@@ -27,9 +27,9 @@
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(
+ TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
new StringReader("Do have a nice day")); // 1-4 length string
- ReverseStringFilter filter = new ReverseStringFilter(stream);
+ ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream);
TermAttribute text = filter.getAttribute(TermAttribute.class);
assertTrue(filter.incrementToken());
assertEquals("oD", text.term());
@@ -45,9 +45,9 @@
}
public void testFilterWithMark() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(new StringReader(
+ TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
"Do have a nice day")); // 1-4 length string
- ReverseStringFilter filter = new ReverseStringFilter(stream, '\u0001');
+ ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream, '\u0001');
TermAttribute text = filter
.getAttribute(TermAttribute.class);
assertTrue(filter.incrementToken());
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java Fri Feb 26 13:09:54 2010
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
@@ -49,9 +50,14 @@
dataDir = new File(System.getProperty("dataDir", "./bin"));
}
- public void testUnicode() throws IOException
+ /**
+ * @deprecated remove this test and its datafiles in Lucene 4.0
+ * the Snowball version has its own data tests.
+ */
+ @Deprecated
+ public void testUnicode30() throws IOException
{
- RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
+ RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
inWords =
new InputStreamReader(
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@@ -65,7 +71,7 @@
TokenStream in = ra.tokenStream("all", inWords);
RussianLetterTokenizer sample =
- new RussianLetterTokenizer(
+ new RussianLetterTokenizer(Version.LUCENE_CURRENT,
sampleUnicode);
TermAttribute text = in.getAttribute(TermAttribute.class);
@@ -109,11 +115,31 @@
}
}
+ /** @deprecated remove this test in Lucene 4.0: stopwords changed */
+ @Deprecated
+ public void testReusableTokenStream30() throws Exception {
+ Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
+ assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+ new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
+ assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
+ new String[] { "знан", "хран", "тайн" });
+
public void testReusableTokenStream() throws Exception {
Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
- new String[] { "знан", "хран", "тайн" });
+ new String[] { "знан", "эт", "хран", "тайн" });
+ }
+
+
+ public void testWithStemExclusionSet() throws Exception {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("представление");
+ Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
+ assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+ new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
+
}
}
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java Fri Feb 26 13:09:54 2010
@@ -24,6 +24,10 @@
import java.io.FileInputStream;
import java.util.ArrayList;
+/**
+ * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
+ */
+@Deprecated
public class TestRussianStem extends LuceneTestCase
{
private ArrayList words = new ArrayList();
Modified: lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=916666&r1=916665&r2=916666&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Fri Feb 26 13:09:54 2010
@@ -106,7 +106,7 @@
*/
public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
- (new WhitespaceAnalyzer(), 2),
+ (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
"test sentence");
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
@@ -117,7 +117,7 @@
*/
public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
- (new WhitespaceAnalyzer(), 2),
+ (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
"\"this sentence\"");
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
@@ -128,7 +128,7 @@
*/
public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
- (new WhitespaceAnalyzer(), 2),
+ (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
"\"test sentence\"");
int[] ranks = new int[] { 1 };
compareRanks(hits, ranks);
@@ -139,7 +139,7 @@
*/
public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
- (new WhitespaceAnalyzer(), 2),
+ (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
"+test +sentence");
int[] ranks = new int[] { 1, 2 };
compareRanks(hits, ranks);
@@ -149,7 +149,7 @@
* This shows how to construct a phrase query containing shingles.
*/
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
- Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+ Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
searcher = setUpSearcher(analyzer);
PhraseQuery q = new PhraseQuery();
@@ -178,7 +178,7 @@
* in the right order and adjacent to each other.
*/
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
- Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+ Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
searcher = setUpSearcher(analyzer);
BooleanQuery q = new BooleanQuery();
@@ -200,7 +200,7 @@
}
public void testReusableTokenStream() throws Exception {
- Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+ Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
assertAnalyzesToReuse(a, "please divide into shingles",
new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
new int[] { 0, 0, 7, 7, 14, 14, 19 },
@@ -222,9 +222,9 @@
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
if (++invocationCount % 2 == 0)
- return new WhitespaceTokenizer(reader);
+ return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
else
- return new LetterTokenizer(reader);
+ return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
}
}
@@ -246,4 +246,117 @@
new int[] { 6, 13, 13, 18, 18, 27, 27 },
new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
+
+ public void testNonDefaultMinShingleSize() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 3, 4);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please", "please divide this", "please divide this sentence",
+ "divide", "divide this sentence", "divide this sentence into",
+ "this", "this sentence into", "this sentence into shingles",
+ "sentence", "sentence into shingles",
+ "into",
+ "shingles" },
+ new int[] { 0, 0, 0, 7, 7, 7, 14, 14, 14, 19, 19, 28, 33 },
+ new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
+ new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
+ analyzer.setOutputUnigrams(false);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please divide this", "please divide this sentence",
+ "divide this sentence", "divide this sentence into",
+ "this sentence into", "this sentence into shingles",
+ "sentence into shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 18, 27, 27, 32, 32, 41, 41 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ }
+
+ public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 3, 3);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please", "please divide this",
+ "divide", "divide this sentence",
+ "this", "this sentence into",
+ "sentence", "sentence into shingles",
+ "into",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19, 19, 28, 33 },
+ new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
+ analyzer.setOutputUnigrams(false);
+ assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+ new String[] { "please divide this",
+ "divide this sentence",
+ "this sentence into",
+ "sentence into shingles" },
+ new int[] { 0, 7, 14, 19 },
+ new int[] { 18, 27, 32, 41 },
+ new int[] { 1, 1, 1, 1 });
+ }
+
+ public void testNoTokenSeparator() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+ analyzer.setTokenSeparator("");
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please", "pleasedivide",
+ "divide", "divideinto",
+ "into", "intoshingles",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ analyzer.setOutputUnigrams(false);
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "pleasedivide",
+ "divideinto",
+ "intoshingles" },
+ new int[] { 0, 7, 14 },
+ new int[] { 13, 18, 27 },
+ new int[] { 1, 1, 1 });
+ }
+
+ public void testNullTokenSeparator() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+ analyzer.setTokenSeparator(null);
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please", "pleasedivide",
+ "divide", "divideinto",
+ "into", "intoshingles",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ analyzer.setOutputUnigrams(false);
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "pleasedivide",
+ "divideinto",
+ "intoshingles" },
+ new int[] { 0, 7, 14 },
+ new int[] { 13, 18, 27 },
+ new int[] { 1, 1, 1 });
+ }
+ public void testAltTokenSeparator() throws Exception {
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+ analyzer.setTokenSeparator("<SEP>");
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please", "please<SEP>divide",
+ "divide", "divide<SEP>into",
+ "into", "into<SEP>shingles",
+ "shingles" },
+ new int[] { 0, 0, 7, 7, 14, 14, 19 },
+ new int[] { 6, 13, 13, 18, 18, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 0, 1 });
+ analyzer.setOutputUnigrams(false);
+ assertAnalyzesToReuse(analyzer, "please divide into shingles",
+ new String[] { "please<SEP>divide",
+ "divide<SEP>into",
+ "into<SEP>shingles" },
+ new int[] { 0, 7, 14 },
+ new int[] { 13, 18, 27 },
+ new int[] { 1, 1, 1 });
+ }
}