You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/07/16 05:34:47 UTC
svn commit: r1361896 - in /lucene/dev/trunk/lucene/analysis:
common/src/test/org/apache/lucene/analysis/cjk/
common/src/test/org/apache/lucene/analysis/de/
common/src/test/org/apache/lucene/analysis/fi/
common/src/test/org/apache/lucene/analysis/fr/ co...
Author: rmuir
Date: Mon Jul 16 03:34:46 2012
New Revision: 1361896
URL: http://svn.apache.org/viewvc?rev=1361896&view=rev
Log:
test some untested analysis corner cases
Added:
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java (with props)
Modified:
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
Added: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java?rev=1361896&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java Mon Jul 16 03:34:46 2012
@@ -0,0 +1,67 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(t, new CJKBigramFilter(t));
+ }
+ };
+
+ public void testHuge() throws Exception {
+ assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+ new String[] {
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた", "た多",
+ "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"
+ }
+ );
+ }
+
+ public void testHanOnly() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
+ }
+ };
+ assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+ new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
+ }
+}
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestGermanLightStemFilter e
assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "sängerinnen", "sängerinnen");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -53,6 +56,19 @@ public class TestGermanMinimalStemFilter
checkOneTerm(analyzer, "äpfel", "apfel");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "sängerinnen", "sängerinnen");
+ }
+
/** Test against a vocabulary from the reference impl */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,9 +23,13 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -58,6 +62,19 @@ public class TestGermanStemFilter extend
assertAnalyzesTo(analyzer, "", new String[] { "" });
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sängerinnen"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new GermanStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "sängerinnen", "sängerinnen");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestFinnishLightStemFilter
assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("edeltäjistään"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "edeltäjistään", "edeltäjistään");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -176,6 +179,19 @@ public class TestFrenchLightStemFilter e
assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "chevaux", "chevaux");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -55,6 +58,19 @@ public class TestFrenchMinimalStemFilter
checkOneTerm(analyzer, "baron", "baron");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("chevaux"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "chevaux", "chevaux");
+ }
+
/** Test against a vocabulary from the reference impl */
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
/**
* Simple tests for {@link GalicianMinimalStemmer}
@@ -50,6 +53,19 @@ public class TestGalicianMinimalStemFilt
checkOneTerm(a, "barcelonês", "barcelonês");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("elefantes"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "elefantes", "elefantes");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestHungarianLightStemFilte
assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("babakocsi"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "babakocsi", "babakocsi");
+ }
+
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java Mon Jul 16 03:34:46 2012
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
@@ -147,6 +148,12 @@ public class TestDutchStemmer extends Ba
checkOneTerm(a, "fiets", "fiets");
}
+ public void testEmptyStemDictionary() throws IOException {
+ DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET,
+ CharArraySet.EMPTY_SET, CharArrayMap.<String>emptyMap());
+ checkOneTerm(a, "fiets", "fiet");
+ }
+
/**
* Test that stopwords are not case sensitive
*/
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -47,6 +50,19 @@ public class TestNorwegianLightStemFilte
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_light.txt")));
}
+
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "sekretæren", "sekretæren");
+ }
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilter.java Mon Jul 16 03:34:46 2012
@@ -25,8 +25,11 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -47,6 +50,19 @@ public class TestNorwegianMinimalStemFil
public void testVocabulary() throws IOException {
assertVocabulary(analyzer, new FileInputStream(getDataFile("nb_minimal.txt")));
}
+
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("sekretæren"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "sekretæren", "sekretæren");
+ }
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -22,11 +22,15 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -93,6 +97,19 @@ public class TestPortugueseLightStemFilt
assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "quilométricas", "quilométricas");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilter.java Mon Jul 16 03:34:46 2012
@@ -22,11 +22,15 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -67,6 +71,19 @@ public class TestPortugueseMinimalStemFi
assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "quilométricas", "quilométricas");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java Mon Jul 16 03:34:46 2012
@@ -24,11 +24,14 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
/**
* Simple tests for {@link PortugueseStemFilter}
@@ -67,6 +70,19 @@ public class TestPortugueseStemFilter ex
assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("quilométricas"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "quilométricas", "quilométricas");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestRussianLightStemFilter
assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("энергии"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "энергии", "энергии");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java Mon Jul 16 03:34:46 2012
@@ -23,8 +23,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.VocabularyAssert.*;
@@ -46,6 +49,19 @@ public class TestSwedishLightStemFilter
assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("jaktkarlens"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "jaktkarlens", "jaktkarlens");
+ }
+
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java Mon Jul 16 03:34:46 2012
@@ -68,6 +68,13 @@ public class TestTurkishLowerCaseFilter
"\u0131\u0316sparta",});
}
+ public void testDecomposed3() throws Exception {
+ TokenStream stream = new MockTokenizer(new StringReader(
+ "\u0049\u0307"), MockTokenizer.WHITESPACE, false);
+ TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
+ assertTokenStreamContents(filter, new String[] {"i"});
+ }
+
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
Modified: lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Mon Jul 16 03:34:46 2012
@@ -20,8 +20,12 @@ package org.apache.lucene.analysis.icu.s
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
+import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
+
+import com.ibm.icu.lang.UScript;
import java.io.IOException;
import java.io.Reader;
@@ -243,4 +247,18 @@ public class TestICUTokenizer extends Ba
Random random = random();
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
+
+ public void testTokenAttributes() throws Exception {
+ TokenStream ts = a.tokenStream("dummy", new StringReader("This is a test"));
+ ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ assertEquals(UScript.LATIN, scriptAtt.getCode());
+ assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
+ assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
+ assertTrue(ts.reflectAsString(false).contains("script=Latin"));
+ }
+ ts.end();
+ ts.close();
+ }
}
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java Mon Jul 16 03:34:46 2012
@@ -22,8 +22,11 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@@ -40,6 +43,21 @@ public class TestJapaneseBaseFormFilter
);
}
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("あり"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
+ }
+ };
+ assertAnalyzesTo(a, "それはまだ実験段階にあります",
+ new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+ );
+ }
+
public void testEnglish() throws IOException {
assertAnalyzesTo(analyzer, "this atest",
new String[] { "this", "atest" });
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java?rev=1361896&r1=1361895&r2=1361896&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java Mon Jul 16 03:34:46 2012
@@ -20,8 +20,11 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.io.Reader;
@@ -60,6 +63,19 @@ public class TestJapaneseKatakanaStemFil
new int[] { 0, 4, 9, 14, 20, 25 },
new int[] { 3, 8, 13, 19, 24, 29 });
}
+
+ public void testKeyword() throws IOException {
+ final CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("コーヒー"), false);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
+ }
+ };
+ checkOneTerm(a, "コーヒー", "コーヒー");
+ }
public void testUnsupportedHalfWidthVariants() throws IOException {
// The below result is expected since only full-width katakana is supported