You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/05/30 09:53:46 UTC
svn commit: r1487777 [13/50] - in /lucene/dev/branches/security: ./
dev-tools/ dev-tools/eclipse/dot.settings/ dev-tools/idea/.idea/
dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/replicator/
dev-tools/maven/ dev-tools/maven/lucene/ dev-tools/ma...
Modified: lucene/dev/branches/security/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java Thu May 30 07:53:18 2013
@@ -25,11 +25,10 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import com.ibm.icu.lang.UCharacter;
@@ -75,21 +74,17 @@ import com.ibm.icu.text.RuleBasedBreakIt
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
static final String RULEFILES = "rulefiles";
- private Map<Integer,String> tailored;
+ private final Map<Integer,String> tailored;
private ICUTokenizerConfig config;
- /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
- public ICUTokenizerFactory() {}
-
- @Override
- public void init(Map<String,String> args) {
- super.init(args);
+ /** Creates a new ICUTokenizerFactory */
+ public ICUTokenizerFactory(Map<String,String> args) {
+ super(args);
tailored = new HashMap<Integer,String>();
- String rulefilesArg = args.get(RULEFILES);
+ String rulefilesArg = get(args, RULEFILES);
if (rulefilesArg != null) {
List<String> scriptAndResourcePaths = splitFileNames(rulefilesArg);
for (String scriptAndResourcePath : scriptAndResourcePaths) {
@@ -99,6 +94,9 @@ public class ICUTokenizerFactory extends
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
}
}
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
@@ -144,8 +142,8 @@ public class ICUTokenizerFactory extends
}
@Override
- public Tokenizer create(Reader input) {
+ public ICUTokenizer create(AttributeFactory factory, Reader input) {
assert config != null : "inform must be called first!";
- return new ICUTokenizer(input, config);
+ return new ICUTokenizer(factory, input, config);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,11 +19,11 @@ package org.apache.lucene.analysis.icu;
import java.io.Reader;
import java.io.StringReader;
+import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MockTokenizer;
/** basic tests for {@link ICUFoldingFilterFactory} */
public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
@@ -31,10 +31,21 @@ public class TestICUFoldingFilterFactory
/** basic tests to ensure the folding is working */
public void test() throws Exception {
Reader reader = new StringReader("Résumé");
- ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory();
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
- TokenStream stream = factory.create(tokenizer);
+ ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(new HashMap<String,String>());
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "resume" });
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new ICUFoldingFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java Thu May 30 07:53:18 2013
@@ -19,13 +19,11 @@ package org.apache.lucene.analysis.icu;
import java.io.Reader;
import java.io.StringReader;
-import java.util.Collections;
-import java.util.Map;
+import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/** basic tests for {@link ICUNormalizer2FilterFactory} */
public class TestICUNormalizer2FilterFactory extends BaseTokenStreamTestCase {
@@ -33,14 +31,23 @@ public class TestICUNormalizer2FilterFac
/** Test nfkc_cf defaults */
public void testDefaults() throws Exception {
Reader reader = new StringReader("This is a Ｔｅｓｔ");
- ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory();
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Map<String, String> args = Collections.emptyMap();
- factory.init(args);
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
- TokenStream stream = factory.create(tokenizer);
+ ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(new HashMap<String,String>());
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" });
}
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new ICUNormalizer2FilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
+
// TODO: add tests for different forms
}
Modified: lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java Thu May 30 07:53:18 2013
@@ -23,9 +23,8 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/** basic tests for {@link ICUTransformFilterFactory} */
public class TestICUTransformFilterFactory extends BaseTokenStreamTestCase {
@@ -33,33 +32,48 @@ public class TestICUTransformFilterFacto
/** ensure the transform is working */
public void test() throws Exception {
Reader reader = new StringReader("簡化字");
- ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("id", "Traditional-Simplified");
- factory.init(args);
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
- TokenStream stream = factory.create(tokenizer);
+ ICUTransformFilterFactory factory = new ICUTransformFilterFactory(args);
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "简化字" });
}
/** test forward and reverse direction */
- public void testDirection() throws Exception {
+ public void testForwardDirection() throws Exception {
// forward
Reader reader = new StringReader("Российская Федерация");
- ICUTransformFilterFactory factory = new ICUTransformFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("id", "Cyrillic-Latin");
- factory.init(args);
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
- TokenStream stream = factory.create(tokenizer);
+ ICUTransformFilterFactory factory = new ICUTransformFilterFactory(args);
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" });
-
+ }
+
+ public void testReverseDirection() throws Exception {
// backward (invokes Latin-Cyrillic)
- reader = new StringReader("Rossijskaâ Federaciâ");
+ Reader reader = new StringReader("Rossijskaâ Federaciâ");
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("id", "Cyrillic-Latin");
args.put("direction", "reverse");
- factory.init(args);
- tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
- stream = factory.create(tokenizer);
+ ICUTransformFilterFactory factory = new ICUTransformFilterFactory(args);
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" });
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new ICUTransformFilterFactory(new HashMap<String,String>() {{
+ put("id", "Null");
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java Thu May 30 07:53:18 2013
@@ -30,8 +30,7 @@ import org.apache.lucene.analysis.util.C
public class TestICUTokenizerFactory extends BaseTokenStreamTestCase {
public void testMixedText() throws Exception {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
- ICUTokenizerFactory factory = new ICUTokenizerFactory();
- factory.init(new HashMap<String,String>());
+ ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String,String>());
factory.inform(new ClasspathResourceLoader(getClass()));
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream,
@@ -43,10 +42,9 @@ public class TestICUTokenizerFactory ext
// “ U+201C LEFT DOUBLE QUOTATION MARK; ” U+201D RIGHT DOUBLE QUOTATION MARK
Reader reader = new StringReader
(" Don't,break.at?/(punct)! \u201Cnice\u201D\r\n\r\n85_At:all; `really\" +2=3$5,&813 !@#%$^)(*@#$ ");
- ICUTokenizerFactory factory = new ICUTokenizerFactory();
final Map<String,String> args = new HashMap<String,String>();
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-break-only-on-whitespace.rbbi");
- factory.init(args);
+ ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(this.getClass()));
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream,
@@ -57,10 +55,9 @@ public class TestICUTokenizerFactory ext
public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
Reader reader = new StringReader
("One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
- ICUTokenizerFactory factory = new ICUTokenizerFactory();
final Map<String,String> args = new HashMap<String,String>();
args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
- factory.init(args);
+ ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream,
@@ -77,10 +74,9 @@ public class TestICUTokenizerFactory ext
public void testKeywordTokenizeCyrillicAndThai() throws Exception {
Reader reader = new StringReader
("Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
- ICUTokenizerFactory factory = new ICUTokenizerFactory();
final Map<String,String> args = new HashMap<String,String>();
args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
- factory.init(args);
+ ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
factory.inform(new ClasspathResourceLoader(getClass()));
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] { "Some", "English",
@@ -88,4 +84,16 @@ public class TestICUTokenizerFactory ext
"ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
"More", "English" });
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new ICUTokenizerFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java Thu May 30 07:53:18 2013
@@ -89,7 +89,7 @@ public class JapaneseAnalyzer extends St
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
- stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
+ stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords);
stream = new JapaneseKatakanaStemFilter(stream);
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilter.java Thu May 30 07:53:18 2013
@@ -22,7 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
@@ -32,7 +32,7 @@ import org.apache.lucene.analysis.tokena
* This acts as a lemmatizer for verbs and adjectives.
* <p>
* To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * {@link SetKeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ja;
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -34,6 +36,14 @@ import org.apache.lucene.analysis.util.T
*/
public class JapaneseBaseFormFilterFactory extends TokenFilterFactory {
+ /** Creates a new JapaneseBaseFormFilterFactory */
+ public JapaneseBaseFormFilterFactory(Map<String,String> args) {
+ super(args);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public TokenStream create(TokenStream input) {
return new JapaneseBaseFormFilter(input);
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java Thu May 30 07:53:18 2013
@@ -39,12 +39,20 @@ import java.util.Map;
public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent {
private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji";
-
private static final String NORMALIZE_KANA_PARAM = "normalizeKana";
- private boolean normalizeKanji = true;
-
- private boolean normalizeKana = true;
+ private final boolean normalizeKanji;
+ private final boolean normalizeKana;
+
+ /** Creates a new JapaneseIterationMarkCharFilterFactory */
+ public JapaneseIterationMarkCharFilterFactory(Map<String,String> args) {
+ super(args);
+ normalizeKanji = getBoolean(args, NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
+ normalizeKana = getBoolean(args, NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
@Override
public CharFilter create(Reader input) {
@@ -52,13 +60,6 @@ public class JapaneseIterationMarkCharFi
}
@Override
- public void init(Map<String, String> args) {
- super.init(args);
- normalizeKanji = getBoolean(NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
- normalizeKana = getBoolean(NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
- }
-
- @Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java Thu May 30 07:53:18 2013
@@ -35,7 +35,7 @@ import java.io.IOException;
* </p>
* <p>
* In order to prevent terms from being stemmed, use an instance of
- * {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
+ * {@link org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter}
* or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
* before this {@link TokenStream}.
* </p>
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -37,15 +37,18 @@ import java.util.Map;
*/
public class JapaneseKatakanaStemFilterFactory extends TokenFilterFactory {
private static final String MINIMUM_LENGTH_PARAM = "minimumLength";
- private int minimumLength;
+ private final int minimumLength;
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- minimumLength = getInt(MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
+ /** Creates a new JapaneseKatakanaStemFilterFactory */
+ public JapaneseKatakanaStemFilterFactory(Map<String,String> args) {
+ super(args);
+ minimumLength = getInt(args, MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
if (minimumLength < 2) {
throw new IllegalArgumentException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
}
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java Thu May 30 07:53:18 2013
@@ -22,6 +22,7 @@ import java.util.Set;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
/**
* Removes tokens that match a set of part-of-speech tags.
@@ -30,8 +31,14 @@ public final class JapanesePartOfSpeechS
private final Set<String> stopTags;
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
- public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
- super(enablePositionIncrements, input);
+ /**
+ * Create a new {@link JapanesePartOfSpeechStopFilter}.
+ * @param version the Lucene match version
+ * @param input the {@link TokenStream} to consume
+ * @param stopTags the part-of-speech tags that should be removed
+ */
+ public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) {
+ super(version, input);
this.stopTags = stopTags;
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,11 +19,14 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
-import org.apache.lucene.analysis.util.*;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter}.
@@ -32,20 +35,26 @@ import org.apache.lucene.analysis.util.*
* <analyzer>
* <tokenizer class="solr.JapaneseTokenizerFactory"/>
* <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
- * tags="stopTags.txt"
- * enablePositionIncrements="true"/>
+ * tags="stopTags.txt"/>
* </analyzer>
* </fieldType>
* </pre>
*/
-public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private boolean enablePositionIncrements;
+public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+ private final String stopTagFiles;
private Set<String> stopTags;
+ /** Creates a new JapanesePartOfSpeechStopFilterFactory */
+ public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
+ super(args);
+ stopTagFiles = get(args, "tags");
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public void inform(ResourceLoader loader) throws IOException {
- String stopTagFiles = args.get("tags");
- enablePositionIncrements = getBoolean("enablePositionIncrements", false);
stopTags = null;
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
if (cas != null) {
@@ -60,6 +69,11 @@ public class JapanesePartOfSpeechStopFil
@Override
public TokenStream create(TokenStream stream) {
// if stoptags is null, it means the file is empty
- return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
+ if (stopTags != null) {
+ final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, stream, stopTags);
+ return filter;
+ } else {
+ return stream;
+ }
}
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java Thu May 30 07:53:18 2013
@@ -37,12 +37,15 @@ import java.util.Map;
*/
public class JapaneseReadingFormFilterFactory extends TokenFilterFactory {
private static final String ROMAJI_PARAM = "useRomaji";
- private boolean useRomaji;
+ private final boolean useRomaji;
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
- useRomaji = getBoolean(ROMAJI_PARAM, false);
+ /** Creates a new JapaneseReadingFormFilterFactory */
+ public JapaneseReadingFormFilterFactory(Map<String,String> args) {
+ super(args);
+ useRomaji = getBoolean(args, ROMAJI_PARAM, false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java Thu May 30 07:53:18 2013
@@ -187,6 +187,8 @@ public final class JapaneseTokenizer ext
/**
* Create a new JapaneseTokenizer.
+ * <p>
+ * Uses the default AttributeFactory.
*
* @param input Reader containing text
* @param userDictionary Optional: if non-null, user dictionary.
@@ -194,7 +196,21 @@ public final class JapaneseTokenizer ext
* @param mode tokenization mode.
*/
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
- super(input);
+ this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
+ }
+
+ /**
+ * Create a new JapaneseTokenizer.
+ *
+ * @param factory the AttributeFactory to use
+ * @param input Reader containing text
+ * @param userDictionary Optional: if non-null, user dictionary.
+ * @param discardPunctuation true if punctuation tokens should be dropped from the output.
+ * @param mode tokenization mode.
+ */
+ public JapaneseTokenizer
+ (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
+ super(factory, input);
dictionary = TokenInfoDictionary.getInstance();
fst = dictionary.getFST();
unkDictionary = UnknownDictionary.getInstance();
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java Thu May 30 07:53:18 2013
@@ -27,11 +27,10 @@ import java.nio.charset.CodingErrorActio
import java.util.Locale;
import java.util.Map;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
@@ -63,17 +62,28 @@ public class JapaneseTokenizerFactory ex
private UserDictionary userDictionary;
- private Mode mode;
-
- private boolean discardPunctuation;
-
+ private final Mode mode;
+ private final boolean discardPunctuation;
+ private final String userDictionaryPath;
+ private final String userDictionaryEncoding;
+
+ /** Creates a new JapaneseTokenizerFactory */
+ public JapaneseTokenizerFactory(Map<String,String> args) {
+ super(args);
+ mode = Mode.valueOf(get(args, MODE, JapaneseTokenizer.DEFAULT_MODE.toString()).toUpperCase(Locale.ROOT));
+ userDictionaryPath = args.remove(USER_DICT_PATH);
+ userDictionaryEncoding = args.remove(USER_DICT_ENCODING);
+ discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
@Override
public void inform(ResourceLoader loader) throws IOException {
- mode = getMode(args);
- String userDictionaryPath = args.get(USER_DICT_PATH);
if (userDictionaryPath != null) {
InputStream stream = loader.openResource(userDictionaryPath);
- String encoding = args.get(USER_DICT_ENCODING);
+ String encoding = userDictionaryEncoding;
if (encoding == null) {
encoding = IOUtils.UTF_8;
}
@@ -85,20 +95,10 @@ public class JapaneseTokenizerFactory ex
} else {
userDictionary = null;
}
- discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
}
@Override
- public Tokenizer create(Reader input) {
- return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
- }
-
- private Mode getMode(Map<String, String> args) {
- String mode = args.get(MODE);
- if (mode != null) {
- return Mode.valueOf(mode.toUpperCase(Locale.ROOT));
- } else {
- return JapaneseTokenizer.DEFAULT_MODE;
- }
+ public JapaneseTokenizer create(AttributeFactory factory, Reader input) {
+ return new JapaneseTokenizer(factory, input, userDictionary, discardPunctuation, mode);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java Thu May 30 07:53:18 2013
@@ -20,8 +20,6 @@ package org.apache.lucene.analysis.ja;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Arrays;
-import java.util.List;
import org.apache.lucene.analysis.util.ResourceLoader;
@@ -34,12 +32,21 @@ class StringMockResourceLoader implement
}
@Override
+ public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
+ try {
+ return Class.forName(cname).asSubclass(expectedType);
+ } catch (Exception e) {
+ throw new RuntimeException("Cannot load class: " + cname, e);
+ }
+ }
+
+ @Override
public <T> T newInstance(String cname, Class<T> expectedType) {
+ Class<? extends T> clazz = findClass(cname, expectedType);
try {
- Class<? extends T> clazz = Class.forName(cname).asSubclass(expectedType);
return clazz.newInstance();
} catch (Exception e) {
- throw new RuntimeException(e);
+ throw new RuntimeException("Cannot create instance: " + cname, e);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilter.java Thu May 30 07:53:18 2013
@@ -25,7 +25,7 @@ import org.apache.lucene.analysis.BaseTo
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
public class TestJapaneseBaseFormFilter extends BaseTokenStreamTestCase {
@@ -49,7 +49,7 @@ public class TestJapaneseBaseFormFilter
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
- TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
}
};
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,8 +19,7 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
-import java.util.Collections;
-import java.util.Map;
+import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
@@ -30,16 +29,25 @@ import org.apache.lucene.analysis.TokenS
*/
public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase {
public void testBasics() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Map<String, String> args = Collections.emptyMap();
- tokenizerFactory.init(args);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream ts = tokenizerFactory.create(new StringReader("それはまだ実験段階にあります"));
- JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory();
+ JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new HashMap<String,String>());
ts = factory.create(ts);
assertTokenStreamContents(ts,
new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
);
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapaneseBaseFormFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java Thu May 30 07:53:18 2013
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.TokenS
import java.io.IOException;
import java.io.StringReader;
-import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -35,22 +34,17 @@ public class TestJapaneseIterationMarkCh
public void testIterationMarksWithKeywordTokenizer() throws IOException {
final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
- JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
CharFilter filter = filterFactory.create(new StringReader(text));
TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
}
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- Map<String, String> tokenizerArgs = Collections.emptyMap();
- tokenizerFactory.init(tokenizerArgs);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
- JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
- Map<String, String> filterArgs = Collections.emptyMap();
- filterFactory.init(filterArgs);
-
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
);
@@ -59,16 +53,13 @@ public class TestJapaneseIterationMarkCh
}
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- Map<String, String> tokenizerArgs = Collections.emptyMap();
- tokenizerFactory.init(tokenizerArgs);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
- JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
Map<String, String> filterArgs = new HashMap<String, String>();
filterArgs.put("normalizeKanji", "true");
filterArgs.put("normalizeKana", "false");
- filterFactory.init(filterArgs);
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
@@ -78,16 +69,13 @@ public class TestJapaneseIterationMarkCh
}
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- Map<String, String> tokenizerArgs = Collections.emptyMap();
- tokenizerFactory.init(tokenizerArgs);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
- JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
Map<String, String> filterArgs = new HashMap<String, String>();
filterArgs.put("normalizeKanji", "false");
filterArgs.put("normalizeKana", "true");
- filterFactory.init(filterArgs);
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
CharFilter filter = filterFactory.create(
new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
@@ -95,4 +83,16 @@ public class TestJapaneseIterationMarkCh
TokenStream tokenStream = tokenizerFactory.create(filter);
assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"});
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilter.java Thu May 30 07:53:18 2013
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.MockTo
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
@@ -70,7 +70,7 @@ public class TestJapaneseKatakanaStemFil
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenStream sink = new KeywordMarkerFilter(source, exclusionSet);
+ TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
}
};
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java Thu May 30 07:53:18 2013
@@ -22,27 +22,34 @@ import org.apache.lucene.analysis.TokenS
import java.io.IOException;
import java.io.StringReader;
-import java.util.Collections;
-import java.util.Map;
+import java.util.HashMap;
/**
* Simple tests for {@link JapaneseKatakanaStemFilterFactory}
*/
public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCase {
public void testKatakanaStemming() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- Map<String, String> tokenizerArgs = Collections.emptyMap();
- tokenizerFactory.init(tokenizerArgs);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream tokenStream = tokenizerFactory.create(
new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
);
- JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory();
- Map<String, String> filterArgs = Collections.emptyMap();
- filterFactory.init(filterArgs);
+ JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new HashMap<String,String>());;
assertTokenStreamContents(filterFactory.create(tokenStream),
new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある", // パーティー should be stemmed
"図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed
);
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapaneseKatakanaStemFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java Thu May 30 07:53:18 2013
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import java.io.StringReader;
-import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -35,21 +34,30 @@ public class TestJapanesePartOfSpeechSto
"# verb-main:\n" +
"動詞-自立\n";
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Map<String, String> tokenizerArgs = Collections.emptyMap();
- tokenizerFactory.init(tokenizerArgs);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream ts = tokenizerFactory.create(new StringReader("私は制限スピードを超える。"));
- JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory();
Map<String,String> args = new HashMap<String,String>();
+ args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("tags", "stoptags.txt");
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- factory.init(args);
+ JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
factory.inform(new StringMockResourceLoader(tags));
ts = factory.create(ts);
assertTokenStreamContents(ts,
new String[] { "私", "は", "制限", "スピード", "を" }
);
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapanesePartOfSpeechStopFilterFactory(new HashMap<String,String>() {{
+ put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java Thu May 30 07:53:18 2013
@@ -22,22 +22,31 @@ import org.apache.lucene.analysis.TokenS
import java.io.IOException;
import java.io.StringReader;
-import java.util.Collections;
-import java.util.Map;
+import java.util.HashMap;
/**
* Simple tests for {@link JapaneseReadingFormFilterFactory}
*/
public class TestJapaneseReadingFormFilterFactory extends BaseTokenStreamTestCase {
public void testReadings() throws IOException {
- JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
- Map<String, String> args = Collections.emptyMap();
- tokenizerFactory.init(args);
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String,String>());
tokenizerFactory.inform(new StringMockResourceLoader(""));
TokenStream tokenStream = tokenizerFactory.create(new StringReader("先ほどベルリンから来ました。"));
- JapaneseReadingFormFilterFactory filterFactory = new JapaneseReadingFormFilterFactory();
+ JapaneseReadingFormFilterFactory filterFactory = new JapaneseReadingFormFilterFactory(new HashMap<String,String>());
assertTokenStreamContents(filterFactory.create(tokenStream),
new String[] { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" }
);
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapaneseReadingFormFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java Thu May 30 07:53:18 2013
@@ -31,10 +31,7 @@ import org.apache.lucene.analysis.TokenS
*/
public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase {
public void testSimple() throws IOException {
- JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Map<String, String> args = Collections.emptyMap();
- factory.init(args);
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create(new StringReader("これは本ではない"));
assertTokenStreamContents(ts,
@@ -48,10 +45,7 @@ public class TestJapaneseTokenizerFactor
* Test that search mode is enabled and working by default
*/
public void testDefaults() throws IOException {
- JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- Map<String, String> args = Collections.emptyMap();
- factory.init(args);
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new HashMap<String,String>());
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
@@ -63,10 +57,9 @@ public class TestJapaneseTokenizerFactor
* Test mode parameter: specifying normal mode
*/
public void testMode() throws IOException {
- JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("mode", "normal");
- factory.init(args);
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
@@ -84,10 +77,9 @@ public class TestJapaneseTokenizerFactor
"関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" +
"# Custom reading for sumo wrestler\n" +
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
- JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("userDictionary", "userdict.txt");
- factory.init(args);
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(userDict));
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
assertTokenStreamContents(ts,
@@ -99,15 +91,13 @@ public class TestJapaneseTokenizerFactor
* Test preserving punctuation
*/
public void testPreservePunctuation() throws IOException {
- JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("discardPunctuation", "false");
- factory.init(args);
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
factory.inform(new StringMockResourceLoader(""));
TokenStream ts = factory.create(
new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
);
- System.out.println(ts.toString());
assertTokenStreamContents(ts,
new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
"来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
@@ -115,4 +105,16 @@ public class TestJapaneseTokenizerFactor
"お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"}
);
}
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new JapaneseTokenizerFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
+ }
}
Modified: lucene/dev/branches/security/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java Thu May 30 07:53:18 2013
@@ -24,8 +24,6 @@ import java.util.Map;
import morfologik.stemming.PolishStemmer.DICTIONARY;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.morfologik.MorfologikFilter;
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
@@ -52,24 +50,10 @@ public class MorfologikFilterFactory ext
/** Schema attribute. */
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
- /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
- public MorfologikFilterFactory() {}
-
- /**
- * {@inheritDoc}
- */
- @Override
- public TokenStream create(TokenStream ts) {
- return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
- }
-
- /**
- * {@inheritDoc}
- */
- @Override
- public void init(Map<String,String> args) {
- super.init(args);
- String dictionaryName = args.get(DICTIONARY_SCHEMA_ATTRIBUTE);
+ /** Creates a new MorfologikFilterFactory */
+ public MorfologikFilterFactory(Map<String,String> args) {
+ super(args);
+ String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
try {
DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT));
@@ -81,5 +65,13 @@ public class MorfologikFilterFactory ext
+ dictionaryName);
}
}
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public TokenStream create(TokenStream ts) {
+ return new MorfologikFilter(ts, dictionary, luceneMatchVersion);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Thu May 30 07:53:18 2013
@@ -52,6 +52,14 @@ public class TestMorfologikAnalyzer exte
new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
+
+ assertAnalyzesToReuse(
+ a,
+ "T. Gl\u00FCcksberg",
+ new String[] { "to", "tom", "tona", "Gl\u00FCcksberg" },
+ new int[] { 0, 0, 0, 3 },
+ new int[] { 1, 1, 1, 13 },
+ new int[] { 1, 0, 0, 1 });
}
/** Test reuse of MorfologikFilter with leftover stems. */
Modified: lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java Thu May 30 07:53:18 2013
@@ -22,8 +22,8 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test for {@link MorfologikFilterFactory}.
@@ -34,11 +34,21 @@ public class TestMorfologikFilterFactory
Map<String,String> initParams = new HashMap<String,String>();
initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE,
"morfologik");
- MorfologikFilterFactory factory = new MorfologikFilterFactory();
- factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
- factory.init(initParams);
- TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT,
- reader));
- assertTokenStreamContents(ts, new String[] {"rower", "bilet"});
+ MorfologikFilterFactory factory = new MorfologikFilterFactory(initParams);
+ TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ stream = factory.create(stream);
+ assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
+ }
+
+ /** Test that bogus arguments result in exception */
+ public void testBogusArguments() throws Exception {
+ try {
+ new MorfologikFilterFactory(new HashMap<String,String>() {{
+ put("bogusArg", "bogusValue");
+ }});
+ fail();
+ } catch (IllegalArgumentException expected) {
+ assertTrue(expected.getMessage().contains("Unknown parameters"));
+ }
}
}
Modified: lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java Thu May 30 07:53:18 2013
@@ -27,7 +27,6 @@ import org.apache.commons.codec.language
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
@@ -48,13 +47,11 @@ public final class BeiderMorseFilter ext
private final Matcher matcher = pattern.matcher("");
// encoded representation
private String encoded;
- // offsets for any buffered outputs
- private int startOffset;
- private int endOffset;
+ // preserves all attributes for any buffered outputs
+ private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
@@ -83,10 +80,10 @@ public final class BeiderMorseFilter ext
@Override
public boolean incrementToken() throws IOException {
if (matcher.find()) {
- clearAttributes();
+ assert state != null && encoded != null;
+ restoreState(state);
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
posIncAtt.setPositionIncrement(0);
- offsetAtt.setOffset(startOffset, endOffset);
return true;
}
@@ -94,8 +91,7 @@ public final class BeiderMorseFilter ext
encoded = (languages == null)
? engine.encode(termAtt.toString())
: engine.encode(termAtt.toString(), languages);
- startOffset = offsetAtt.startOffset();
- endOffset = offsetAtt.endOffset();
+ state = captureState();
matcher.reset(encoded);
if (matcher.find()) {
termAtt.setEmpty().append(encoded, matcher.start(1), matcher.end(1));
Modified: lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java Thu May 30 07:53:18 2013
@@ -17,22 +17,19 @@ package org.apache.lucene.analysis.phone
* limitations under the License.
*/
-import java.util.Arrays;
-import java.util.HashSet;
import java.util.Map;
+import java.util.Set;
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link BeiderMorseFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
@@ -42,36 +39,27 @@ import org.apache.lucene.analysis.util.T
* </filter>
* </analyzer>
* </fieldType></pre>
- *
*/
public class BeiderMorseFilterFactory extends TokenFilterFactory {
- private PhoneticEngine engine;
- private LanguageSet languageSet;
+ private final PhoneticEngine engine;
+ private final LanguageSet languageSet;
- /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
- public BeiderMorseFilterFactory() {}
-
- @Override
- public void init(Map<String,String> args) {
- super.init(args);
-
+ /** Creates a new BeiderMorseFilterFactory */
+ public BeiderMorseFilterFactory(Map<String,String> args) {
+ super(args);
// PhoneticEngine = NameType + RuleType + concat
// we use common-codec's defaults: GENERIC + APPROX + true
- String nameTypeArg = args.get("nameType");
- NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
-
- String ruleTypeArg = args.get("ruleType");
- RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
+ NameType nameType = NameType.valueOf(get(args, "nameType", NameType.GENERIC.toString()));
+ RuleType ruleType = RuleType.valueOf(get(args, "ruleType", RuleType.APPROX.toString()));
- boolean concat = getBoolean("concat", true);
+ boolean concat = getBoolean(args, "concat", true);
engine = new PhoneticEngine(nameType, ruleType, concat);
// LanguageSet: defaults to automagic, otherwise a comma-separated list.
- String languageSetArg = args.get("languageSet");
- if (languageSetArg == null || languageSetArg.equals("auto")) {
- languageSet = null;
- } else {
- languageSet = LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
+ Set<String> langs = getSet(args, "languageSet");
+ languageSet = (null == langs || (1 == langs.size() && langs.contains("auto"))) ? null : LanguageSet.from(langs);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java Thu May 30 07:53:18 2013
@@ -21,19 +21,17 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link DoubleMetaphoneFilter}.
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_dblmtphn" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.DoubleMetaphoneFilterFactory" inject="true" maxCodeLength="4"/>
* </analyzer>
* </fieldType></pre>
- *
*/
public class DoubleMetaphoneFilterFactory extends TokenFilterFactory
{
@@ -44,20 +42,16 @@ public class DoubleMetaphoneFilterFactor
/** default maxCodeLength if not specified */
public static final int DEFAULT_MAX_CODE_LENGTH = 4;
- private boolean inject = true;
- private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
+ private final boolean inject;
+ private final int maxCodeLength;
- /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
- public DoubleMetaphoneFilterFactory() {}
-
- @Override
- public void init(Map<String, String> args) {
- super.init(args);
-
- inject = getBoolean(INJECT, true);
-
- if (args.get(MAX_CODE_LENGTH) != null) {
- maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH));
+ /** Creates a new DoubleMetaphoneFilterFactory */
+ public DoubleMetaphoneFilterFactory(Map<String,String> args) {
+ super(args);
+ inject = getBoolean(args, INJECT, true);
+ maxCodeLength = getInt(args, MAX_CODE_LENGTH, DEFAULT_MAX_CODE_LENGTH);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java Thu May 30 07:53:18 2013
@@ -25,9 +25,13 @@ import java.util.Locale;
import java.util.Map;
import org.apache.commons.codec.Encoder;
-import org.apache.commons.codec.language.*;
+import org.apache.commons.codec.language.Caverphone2;
+import org.apache.commons.codec.language.ColognePhonetic;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -49,7 +53,7 @@ import org.apache.lucene.analysis.util.T
* support this then specifying this is an error.</dd>
* </dl>
*
- * <pre class="prettyprint" >
+ * <pre class="prettyprint">
* <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -59,9 +63,7 @@ import org.apache.lucene.analysis.util.T
*
* @see PhoneticFilter
*/
-public class PhoneticFilterFactory extends TokenFilterFactory
- implements ResourceLoaderAware
-{
+public class PhoneticFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
/** parameter name: either a short name or a full class name */
public static final String ENCODER = "encoder";
/** parameter name: true if encoded tokens should be added as synonyms */
@@ -82,33 +84,36 @@ public class PhoneticFilterFactory exten
registry.put("ColognePhonetic".toUpperCase(Locale.ROOT), ColognePhonetic.class);
}
- boolean inject = true; //accessed by the test
- private String name = null;
+ final boolean inject; //accessed by the test
+ private final String name;
+ private final Integer maxCodeLength;
private Class<? extends Encoder> clazz = null;
private Method setMaxCodeLenMethod = null;
- private Integer maxCodeLength = null;
- /** Sole constructor. See {@link AbstractAnalysisFactory} for initialization lifecycle. */
- public PhoneticFilterFactory() {}
+ /** Creates a new PhoneticFilterFactory */
+ public PhoneticFilterFactory(Map<String,String> args) {
+ super(args);
+ inject = getBoolean(args, INJECT, true);
+ name = require(args, ENCODER);
+ String v = get(args, MAX_CODE_LENGTH);
+ if (v != null) {
+ maxCodeLength = Integer.valueOf(v);
+ } else {
+ maxCodeLength = null;
+ }
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
@Override
public void inform(ResourceLoader loader) throws IOException {
-
- inject = getBoolean(INJECT, true);
-
- String name = args.get( ENCODER );
- if( name == null ) {
- throw new IllegalArgumentException("Missing required parameter: " + ENCODER
- + " [" + registry.keySet() + "]");
- }
clazz = registry.get(name.toUpperCase(Locale.ROOT));
if( clazz == null ) {
clazz = resolveEncoder(name, loader);
}
- String v = args.get(MAX_CODE_LENGTH);
- if (v != null) {
- maxCodeLength = Integer.valueOf(v);
+ if (maxCodeLength != null) {
try {
setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class);
} catch (Exception e) {
Modified: lucene/dev/branches/security/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilter.java Thu May 30 07:53:18 2013
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.phone
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import java.util.HashSet;
+import java.util.regex.Pattern;
import org.apache.commons.codec.language.bm.NameType;
import org.apache.commons.codec.language.bm.PhoneticEngine;
@@ -29,7 +31,10 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.junit.Ignore;
/** Tests {@link BeiderMorseFilter} */
@@ -103,4 +108,20 @@ public class TestBeiderMorseFilter exten
};
checkOneTermReuse(a, "", "");
}
+
+ public void testCustomAttribute() throws IOException {
+ TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo"));
+ stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
+ stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
+ KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
+ stream.reset();
+ int i = 0;
+ while(stream.incrementToken()) {
+ assertTrue(keyAtt.isKeyword());
+ i++;
+ }
+ assertEquals(12, i);
+ stream.end();
+ stream.close();
+ }
}