You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:29 UTC
[04/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji +
tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
new file mode 100644
index 0000000..609803f
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilter.cs
@@ -0,0 +1,84 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests for JapaneseBaseFormFilter. The tests below show the filter
+ /// replacing an inflected token with its base (dictionary) form,
+ /// e.g. "あり" -> "ある" in TestBasics.
+ /// </summary>
+ public class TestJapaneseBaseFormFilter : BaseTokenStreamTestCase
+ {
+ // Default analysis chain: JapaneseTokenizer -> JapaneseBaseFormFilter.
+ private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+ return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
+ });
+
+
+ // "あります" is emitted as the base form "ある" plus "ます".
+ [Test]
+ public void TestBasics()
+ {
+ AssertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
+ new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
+ );
+ }
+
+ // Tokens marked as keywords via SetKeywordMarkerFilter are left untouched:
+ // here "あり" stays "あり" instead of becoming "ある" as in TestBasics.
+ [Test]
+ public void TestKeyword()
+ {
+ CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("あり"), false);
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
+ TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
+ });
+
+ AssertAnalyzesTo(a, "それはまだ実験段階にあります",
+ new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
+ );
+ }
+
+ // Non-Japanese input passes through unchanged.
+ [Test]
+ public void TestEnglish()
+ {
+ AssertAnalyzesTo(analyzer, "this atest",
+ new String[] { "this", "atest" });
+ }
+
+ // Blast random strings through the analyzer looking for exceptions.
+ [Test]
+ public void TestRandomStrings()
+ {
+ CheckRandomData(Random(), analyzer, AtLeast(1000));
+ }
+
+ // An empty term must survive the filter as an empty term.
+ [Test]
+ public void TestEmptyTerm()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
+ });
+
+ CheckOneTerm(a, "", "");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
new file mode 100644
index 0000000..61a8b2e
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseBaseFormFilterFactory.cs
@@ -0,0 +1,60 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapaneseBaseFormFilterFactory"/>.
+ /// </summary>
+ public class TestJapaneseBaseFormFilterFactory : BaseTokenStreamTestCase
+ {
+ // The factory-created filter must normalize inflected tokens to their
+ // base form ("あります" -> "ある" + "ます").
+ [Test]
+ public void TestBasics()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+ JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(new Dictionary<String, String>());
+ TokenStream stream = factory.Create(tokenizerFactory.Create(new StringReader("それはまだ実験段階にあります")));
+ AssertTokenStreamContents(stream,
+ new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
+ );
+ }
+
+ /// <summary>Test that bogus arguments result in an exception.</summary>
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ var bogusArgs = new Dictionary<String, String> { { "bogusArg", "bogusValue" } };
+ new JapaneseBaseFormFilterFactory(bogusArgs);
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
new file mode 100644
index 0000000..9db0903
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilter.cs
@@ -0,0 +1,241 @@
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests for <see cref="JapaneseIterationMarkCharFilter"/>, which replaces
+ /// Japanese iteration marks (々, ゝ, ゞ, ヽ, ヾ) with the character(s) they
+ /// repeat (optionally kanji-only or kana-only — see the boolean ctor args).
+ /// </summary>
+ public class TestJapaneseIterationMarkCharFilter : BaseTokenStreamTestCase
+ {
+ // Keyword analyzer keeps the whole input as a single token so the char
+ // filter's output can be asserted verbatim.
+ private Analyzer keywordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ },
+ initReader: (fieldName, reader) =>
+ {
+ return new JapaneseIterationMarkCharFilter(reader);
+ });
+
+
+ // Runs the char filter in front of the real Japanese tokenizer, so some
+ // expected outputs below differ in tokenization ("Side effect" comments).
+ private Analyzer japaneseAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, false, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ },
+ initReader: (fieldName, reader) =>
+ {
+ return new JapaneseIterationMarkCharFilter(reader);
+ });
+
+ [Test]
+ public void TestKanji()
+ {
+ // Test single repetition
+ AssertAnalyzesTo(keywordAnalyzer, "時々", new String[] { "時時" });
+ AssertAnalyzesTo(japaneseAnalyzer, "時々", new String[] { "時時" });
+
+ // Test multiple repetitions
+ AssertAnalyzesTo(keywordAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
+ AssertAnalyzesTo(japaneseAnalyzer, "馬鹿々々しい", new String[] { "馬鹿馬鹿しい" });
+ }
+
+ [Test]
+ public void TestKatakana()
+ {
+ // Test single repetition
+ AssertAnalyzesTo(keywordAnalyzer, "ミスヾ", new String[] { "ミスズ" });
+ AssertAnalyzesTo(japaneseAnalyzer, "ミスヾ", new String[] { "ミ", "スズ" }); // Side effect
+ }
+
+ // NOTE: renamed from testHiragana to match the PascalCase TestXxx naming
+ // used by every other test in this file.
+ [Test]
+ public void TestHiragana()
+ {
+ // Test single unvoiced iteration
+ AssertAnalyzesTo(keywordAnalyzer, "おゝの", new String[] { "おおの" });
+ AssertAnalyzesTo(japaneseAnalyzer, "おゝの", new String[] { "お", "おの" }); // Side effect
+
+ // Test single voiced iteration
+ AssertAnalyzesTo(keywordAnalyzer, "みすゞ", new String[] { "みすず" });
+ AssertAnalyzesTo(japaneseAnalyzer, "みすゞ", new String[] { "みすず" });
+
+ // Test single voiced iteration
+ AssertAnalyzesTo(keywordAnalyzer, "じゞ", new String[] { "じじ" });
+ AssertAnalyzesTo(japaneseAnalyzer, "じゞ", new String[] { "じじ" });
+
+ // Test single unvoiced iteration with voiced iteration
+ AssertAnalyzesTo(keywordAnalyzer, "じゝ", new String[] { "じし" });
+ AssertAnalyzesTo(japaneseAnalyzer, "じゝ", new String[] { "じし" });
+
+ // Test multiple repetitions with voiced iteration
+ AssertAnalyzesTo(keywordAnalyzer, "ところゞゝゝ", new String[] { "ところどころ" });
+ AssertAnalyzesTo(japaneseAnalyzer, "ところゞゝゝ", new String[] { "ところどころ" });
+ }
+
+ [Test]
+ public void TestMalformed()
+ {
+ // We can't iterate c here, so emit as it is
+ AssertAnalyzesTo(keywordAnalyzer, "abcところゝゝゝゝ", new String[] { "abcところcところ" });
+
+ // We can't iterate c (with dakuten change) here, so emit it as-is
+ AssertAnalyzesTo(keywordAnalyzer, "abcところゞゝゝゝ", new String[] { "abcところcところ" });
+
+ // We can't iterate before beginning of stream, so emit characters as-is
+ AssertAnalyzesTo(keywordAnalyzer, "ところゞゝゝゞゝゞ", new String[] { "ところどころゞゝゞ" });
+
+ // We can't iterate an iteration mark only, so emit as-is
+ AssertAnalyzesTo(keywordAnalyzer, "々", new String[] { "々" });
+ AssertAnalyzesTo(keywordAnalyzer, "ゞ", new String[] { "ゞ" });
+ AssertAnalyzesTo(keywordAnalyzer, "ゞゝ", new String[] { "ゞゝ" });
+
+ // We can't iterate a full stop punctuation mark (because we use it as a flush marker)
+ AssertAnalyzesTo(keywordAnalyzer, "。ゝ", new String[] { "。ゝ" });
+ AssertAnalyzesTo(keywordAnalyzer, "。。ゝゝ", new String[] { "。。ゝゝ" });
+
+ // We can iterate other punctuation marks
+ AssertAnalyzesTo(keywordAnalyzer, "?ゝ", new String[] { "??" });
+
+ // We can not get a dakuten variant of ぽ -- this is also a corner case test for inside()
+ AssertAnalyzesTo(keywordAnalyzer, "ねやぽゞつむぴ", new String[] { "ねやぽぽつむぴ" });
+ AssertAnalyzesTo(keywordAnalyzer, "ねやぽゝつむぴ", new String[] { "ねやぽぽつむぴ" });
+ }
+
+ [Test]
+ public void TestEmpty()
+ {
+ // Empty input stays empty
+ AssertAnalyzesTo(keywordAnalyzer, "", new String[0]);
+ AssertAnalyzesTo(japaneseAnalyzer, "", new String[0]);
+ }
+
+ [Test]
+ public void TestFullStop()
+ {
+ // Test full stops
+ AssertAnalyzesTo(keywordAnalyzer, "。", new String[] { "。" });
+ AssertAnalyzesTo(keywordAnalyzer, "。。", new String[] { "。。" });
+ AssertAnalyzesTo(keywordAnalyzer, "。。。", new String[] { "。。。" });
+ }
+
+ [Test]
+ public void TestKanjiOnly()
+ {
+ // Test kanji only repetition marks
+ CharFilter filter = new JapaneseIterationMarkCharFilter(
+ new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
+ true, // kanji
+ false // no kana
+ );
+ assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
+ }
+
+ [Test]
+ public void TestKanaOnly()
+ {
+ // Test kana only repetition marks
+ CharFilter filter = new JapaneseIterationMarkCharFilter(
+ new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
+ false, // no kanji
+ true // kana
+ );
+ assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
+ }
+
+ [Test]
+ public void TestNone()
+ {
+ // Test no repetition marks
+ CharFilter filter = new JapaneseIterationMarkCharFilter(
+ new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
+ false, // no kanji
+ false // no kana
+ );
+ assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
+ }
+
+ [Test]
+ public void TestCombinations()
+ {
+ AssertAnalyzesTo(keywordAnalyzer, "時々、おゝのさんと一緒にお寿司を食べに行きます。",
+ new String[] { "時時、おおのさんと一緒にお寿司を食べに行きます。" }
+ );
+ }
+
+ [Test]
+ public void TestHiraganaCoverage()
+ {
+ // Test all hiragana iteration variants
+ String source = "かゝがゝきゝぎゝくゝぐゝけゝげゝこゝごゝさゝざゝしゝじゝすゝずゝせゝぜゝそゝぞゝたゝだゝちゝぢゝつゝづゝてゝでゝとゝどゝはゝばゝひゝびゝふゝぶゝへゝべゝほゝぼゝ";
+ String target = "かかがかききぎきくくぐくけけげけここごこささざさししじしすすずすせせぜせそそぞそたただたちちぢちつつづつててでてととどとははばはひひびひふふぶふへへべへほほぼほ";
+ AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+
+ // Test all hiragana iteration variants with dakuten
+ source = "かゞがゞきゞぎゞくゞぐゞけゞげゞこゞごゞさゞざゞしゞじゞすゞずゞせゞぜゞそゞぞゞたゞだゞちゞぢゞつゞづゞてゞでゞとゞどゞはゞばゞひゞびゞふゞぶゞへゞべゞほゞぼゞ";
+ target = "かがががきぎぎぎくぐぐぐけげげげこごごごさざざざしじじじすずずずせぜぜぜそぞぞぞただだだちぢぢぢつづづづてでででとどどどはばばばひびびびふぶぶぶへべべべほぼぼぼ";
+ AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+ }
+
+ [Test]
+ public void TestKatakanaCoverage()
+ {
+ // Test all katakana iteration variants
+ String source = "カヽガヽキヽギヽクヽグヽケヽゲヽコヽゴヽサヽザヽシヽジヽスヽズヽセヽゼヽソヽゾヽタヽダヽチヽヂヽツヽヅヽテヽデヽトヽドヽハヽバヽヒヽビヽフヽブヽヘヽベヽホヽボヽ";
+ String target = "カカガカキキギキククグクケケゲケココゴコササザサシシジシススズスセセゼセソソゾソタタダタチチヂチツツヅツテテデテトトドトハハバハヒヒビヒフフブフヘヘベヘホホボホ";
+ AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+
+ // Test all katakana iteration variants with dakuten
+ source = "カヾガヾキヾギヾクヾグヾケヾゲヾコヾゴヾサヾザヾシヾジヾスヾズヾセヾゼヾソヾゾヾタヾダヾチヾヂヾツヾヅヾテヾデヾトヾドヾハヾバヾヒヾビヾフヾブヾヘヾベヾホヾボヾ";
+ target = "カガガガキギギギクグググケゲゲゲコゴゴゴサザザザシジジジスズズズセゼゼゼソゾゾゾタダダダチヂヂヂツヅヅヅテデデデトドドドハバババヒビビビフブブブヘベベベホボボボ";
+ AssertAnalyzesTo(keywordAnalyzer, source, new String[] { target });
+ }
+
+ [Test]
+ public void TestRandomStrings()
+ {
+ // Blast some random strings through
+ CheckRandomData(Random(), keywordAnalyzer, 1000 * RANDOM_MULTIPLIER);
+ }
+
+ [Test]
+ public void TestRandomHugeStrings()
+ {
+ // Blast some random strings through
+ CheckRandomData(Random(), keywordAnalyzer, 100 * RANDOM_MULTIPLIER, 8192);
+ }
+
+ // Asserts that fully reading the char filter yields the expected string.
+ private void assertCharFilterEquals(CharFilter filter, String expected)
+ {
+ String actual = readFully(filter);
+ assertEquals(expected, actual);
+ }
+
+ // Drains the reader into a string; TextReader.Read() returns -1 at EOF.
+ // Fixed Java-style casing (buffer.append/buffer.toString), which does not
+ // compile against System.Text.StringBuilder.
+ private String readFully(TextReader stream)
+ {
+ StringBuilder buffer = new StringBuilder();
+ int ch;
+ while ((ch = stream.Read()) != -1)
+ {
+ buffer.Append((char)ch);
+ }
+ return buffer.ToString();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
new file mode 100644
index 0000000..88f71a9
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseIterationMarkCharFilterFactory.cs
@@ -0,0 +1,108 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapaneseIterationMarkCharFilterFactory"/>
+ /// </summary>
+ public class TestJapaneseIterationMarkCharFilterFactory : BaseTokenStreamTestCase
+ {
+ // With a keyword tokenizer the filter's raw output is one token, so all
+ // iteration marks (々, ゝ, ゞ, ヾ) must be expanded in place.
+ [Test]
+ public void TestIterationMarksWithKeywordTokenizer()
+ {
+ String text = "時々馬鹿々々しいところゞゝゝミスヾ";
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
+ TextReader filter = filterFactory.Create(new StringReader(text));
+ TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
+ AssertTokenStreamContents(tokenStream, new String[] { "時時馬鹿馬鹿しいところどころミスズ" });
+ }
+
+ // Default factory args: both kanji and kana iteration marks are normalized
+ // before the Japanese tokenizer segments the text.
+ [Test]
+ public void TestIterationMarksWithJapaneseTokenizer()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>());
+ TextReader filter = filterFactory.Create(
+ new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+ );
+ TokenStream tokenStream = tokenizerFactory.Create(filter);
+ AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ" });
+ }
+
+ // normalizeKanji=true / normalizeKana=false: only 々 is expanded; the kana
+ // marks (ゞ, ゝ, ヾ) pass through untouched.
+ [Test]
+ public void TestKanjiOnlyIterationMarksWithJapaneseTokenizer()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+ IDictionary<String, String> filterArgs = new Dictionary<String, String>();
+ filterArgs.Put("normalizeKanji", "true");
+ filterArgs.Put("normalizeKana", "false");
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
+
+ TextReader filter = filterFactory.Create(
+ new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+ );
+ TokenStream tokenStream = tokenizerFactory.Create(filter);
+ AssertTokenStreamContents(tokenStream, new String[] { "時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ" });
+ }
+
+ // normalizeKanji=false / normalizeKana=true: the mirror case — 々 passes
+ // through, kana iteration marks are expanded.
+ [Test]
+ public void TestKanaOnlyIterationMarksWithJapaneseTokenizer()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+
+ IDictionary<String, String> filterArgs = new Dictionary<String, String>();
+ filterArgs.Put("normalizeKanji", "false");
+ filterArgs.Put("normalizeKana", "true");
+ JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
+
+ TextReader filter = filterFactory.Create(
+ new StringReader("時々馬鹿々々しいところゞゝゝミスヾ")
+ );
+ TokenStream tokenStream = tokenizerFactory.Create(filter);
+ AssertTokenStreamContents(tokenStream, new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
+ }
+
+ /** Test that bogus arguments result in exception */
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ new JapaneseIterationMarkCharFilterFactory(new Dictionary<String, String>() {
+ { "bogusArg", "bogusValue" }
+ });
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
new file mode 100644
index 0000000..cbbc95b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilter.cs
@@ -0,0 +1,100 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests for <see cref="JapaneseKatakanaStemFilter"/>
+ /// </summary>
+ public class TestJapaneseKatakanaStemFilter : BaseTokenStreamTestCase
+ {
+ private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ // Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(source));
+ });
+
+ /// <summary>
+ /// Test a few common katakana spelling variations.
+ /// English translations are as follows:
+ /// <list type="bullet">
+ /// <item><description>copy</description></item>
+ /// <item><description>coffee</description></item>
+ /// <item><description>taxi</description></item>
+ /// <item><description>party</description></item>
+ /// <item><description>party (without long sound)</description></item>
+ /// <item><description>center</description></item>
+ /// </list>
+ /// Note that we remove a long sound in the case of "coffee" that is required.
+ /// </summary>
+ [Test]
+ public void TestStemVariants()
+ {
+ AssertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
+ new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
+ new int[] { 0, 4, 9, 14, 20, 25 },
+ new int[] { 3, 8, 13, 19, 24, 29 });
+ }
+
+ // Keyword-marked terms are excluded from stemming: "コーヒー" keeps its
+ // long sound here, unlike in TestStemVariants.
+ [Test]
+ public void TestKeyword()
+ {
+ CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("コーヒー"), false);
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+ TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
+ return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
+ });
+ CheckOneTerm(a, "コーヒー", "コーヒー");
+ }
+
+ [Test]
+ public void TestUnsupportedHalfWidthVariants()
+ {
+ // The below result is expected since only full-width katakana is supported
+ AssertAnalyzesTo(analyzer, "タクシー", new String[] { "タクシー" });
+ }
+
+ // Blast random strings through the analyzer looking for exceptions.
+ [Test]
+ public void TestRandomData()
+ {
+ CheckRandomData(Random(), analyzer, 1000 * RANDOM_MULTIPLIER);
+ }
+
+ // An empty term must survive the filter as an empty term.
+ [Test]
+ public void TestEmptyTerm()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new JapaneseKatakanaStemFilter(tokenizer));
+ });
+
+ CheckOneTerm(a, "", "");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
new file mode 100644
index 0000000..49ac181
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseKatakanaStemFilterFactory.cs
@@ -0,0 +1,62 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapaneseKatakanaStemFilterFactory"/>
+ /// </summary>
+ public class TestJapaneseKatakanaStemFilterFactory : BaseTokenStreamTestCase
+ {
+ // The factory-created filter stems a trailing long sound ("パーティー" ->
+ // "パーティ") but leaves short katakana terms ("コピー") untouched.
+ [Test]
+ public void TestKatakanaStemming()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+ TokenStream tokenStream = tokenizerFactory.Create(
+ new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。")
+ );
+ // Fixed: removed a stray empty statement ("); ;") left over from the port.
+ JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>());
+ AssertTokenStreamContents(filterFactory.Create(tokenStream),
+ new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある", // パーティー should be stemmed
+ "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed
+ );
+ }
+
+ /** Test that bogus arguments result in exception */
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ new JapaneseKatakanaStemFilterFactory(new Dictionary<String, String>() {
+ { "bogusArg", "bogusValue" }
+ });
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
new file mode 100644
index 0000000..617a1b8
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapanesePartOfSpeechStopFilterFactory.cs
@@ -0,0 +1,70 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapanesePartOfSpeechStopFilterFactory"/>
+ /// </summary>
+ public class TestJapanesePartOfSpeechStopFilterFactory : BaseTokenStreamTestCase
+ {
+ // Stop-tags file listing 動詞-自立 (verb-main) removes the verb 超える from
+ // the output while all other tokens pass through.
+ [Test]
+ public void TestBasics()
+ {
+ String tags =
+ "# verb-main:\n" +
+ "動詞-自立\n";
+
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+ TokenStream ts = tokenizerFactory.Create(new StringReader("私は制限スピードを超える。"));
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ args.Put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
+ args.Put("tags", "stoptags.txt");
+ JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(args);
+ // The mock resource loader serves the tags string as "stoptags.txt".
+ factory.Inform(new StringMockResourceLoader(tags));
+ ts = factory.Create(ts);
+ AssertTokenStreamContents(ts,
+ new String[] { "私", "は", "制限", "スピード", "を" }
+ );
+ }
+
+ /** Test that bogus arguments result in exception */
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ new JapanesePartOfSpeechStopFilterFactory(new Dictionary<String, String>() {
+ { "luceneMatchVersion", TEST_VERSION_CURRENT.toString() },
+ { "bogusArg", "bogusValue" }
+ });
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
new file mode 100644
index 0000000..141db33
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilter.cs
@@ -0,0 +1,109 @@
+using Lucene.Net.Analysis.Cjk;
+using Lucene.Net.Analysis.Core;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests for <see cref="JapaneseReadingFormFilter"/>
+ /// </summary>
+ public class TestJapaneseReadingFormFilter : BaseTokenStreamTestCase
+ {
+ // useRomaji=false: tokens are replaced by their katakana readings.
+ private Analyzer katakanaAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, false));
+ });
+
+ // useRomaji=true: tokens are replaced by their romanized readings.
+ private Analyzer romajiAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer, true));
+ });
+
+
+ [Test]
+ public void TestKatakanaReadings()
+ {
+ AssertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した",
+ new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
+
+ // Same expectations as TestKatakanaReadings, with a CJKWidthFilter in front
+ // to fold the half-width katakana "ロバート" before reading lookup.
+ [Test]
+ public void TestKatakanaReadingsHalfWidth()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+ });
+
+ AssertAnalyzesTo(a, "今夜はロバート先生と話した",
+ new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
+
+ [Test]
+ public void TestRomajiReadings()
+ {
+ AssertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
+
+ // Romaji variant of the half-width test above.
+ [Test]
+ public void TestRomajiReadingsHalfWidth()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+ });
+
+ AssertAnalyzesTo(a, "今夜はロバート先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
+
+ // Blast random strings through both analyzers looking for exceptions.
+ [Test]
+ public void TestRandomData()
+ {
+ Random random = Random();
+ CheckRandomData(random, katakanaAnalyzer, 1000 * RANDOM_MULTIPLIER);
+ CheckRandomData(random, romajiAnalyzer, 1000 * RANDOM_MULTIPLIER);
+ }
+
+ // An empty term must survive the filter as an empty term.
+ [Test]
+ public void TestEmptyTerm()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer));
+ });
+
+ CheckOneTerm(a, "", "");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
new file mode 100644
index 0000000..053652b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseReadingFormFilterFactory.cs
@@ -0,0 +1,59 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapaneseReadingFormFilterFactory"/>
+ /// </summary>
+ public class TestJapaneseReadingFormFilterFactory : BaseTokenStreamTestCase
+ {
+ [Test]
+ public void TestReadings()
+ {
+ JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ tokenizerFactory.Inform(new StringMockResourceLoader(""));
+ TokenStream tokenStream = tokenizerFactory.Create(new StringReader("先ほどベルリンから来ました。"));
+ JapaneseReadingFormFilterFactory filterFactory = new JapaneseReadingFormFilterFactory(new Dictionary<String, String>());
+ AssertTokenStreamContents(filterFactory.Create(tokenStream),
+ new String[] { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" }
+ );
+ }
+
+ /** Test that bogus arguments result in exception */
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ new JapaneseReadingFormFilterFactory(new Dictionary<String, String>() {
+ { "bogusArg", "bogusValue" }
+ });
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
new file mode 100644
index 0000000..0a1f819
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizer.cs
@@ -0,0 +1,846 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ public class TestJapaneseTokenizer : BaseTokenStreamTestCase
+ {
+ public static UserDictionary ReadDict()
+ {
+ Stream @is = typeof(TestJapaneseTokenizer).getResourceAsStream("userdict.txt");
+ if (@is == null)
+ {
+ throw new Exception("Cannot find userdict.txt in test classpath!");
+ }
+ try
+ {
+ try
+ {
+ TextReader reader = new StreamReader(@is, Encoding.UTF8);
+ return new UserDictionary(reader);
+ }
+ finally
+ {
+ @is.Dispose();
+ }
+ }
+ catch (IOException ioe)
+ {
+ throw new Exception(ioe.ToString(), ioe);
+ }
+ }
+
+ private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+
+ private Analyzer analyzerNormal = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+ private Analyzer analyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+
+ private Analyzer extendedModeAnalyzerNoPunct = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), true, JapaneseTokenizerMode.EXTENDED);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+
+ [Test]
+ public void TestNormalMode()
+ {
+ AssertAnalyzesTo(analyzerNormal,
+ "シニアソフトウェアエンジニア",
+ new String[] { "シニアソフトウェアエンジニア" });
+ }
+
+ [Test]
+ public void TestDecomposition1()
+ {
+ AssertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+ "アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
+ new String[] { "本来", "は", "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",
+ "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある", "アメリカ",
+ "低", "所得", "者", "医療", "援助", "制度", "が", "今日", "で", "は", "その",
+ "予算", "の", "約", "3", "分の", "1", "を", "老人", "に", "費やし", "て", "いる" },
+ new int[] { 0, 2, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30,
+ 31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60,
+ 62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
+ new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
+ 33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62,
+ 63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
+ );
+ }
+
+ [Test]
+ public void TestDecomposition2()
+ {
+ AssertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
+ new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
+ new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
+ new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
+ );
+ }
+
+ [Test]
+ public void TestDecomposition3()
+ {
+ AssertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
+ new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス" },
+ new int[] { 0, 2, 3, 5, 10 },
+ new int[] { 2, 3, 5, 9, 15 }
+ );
+ }
+
+ [Test]
+ public void TestDecomposition4()
+ {
+ AssertAnalyzesTo(analyzer, "これは本ではない",
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6 },
+ new int[] { 2, 3, 4, 5, 6, 8 }
+ );
+ }
+
+ /* Note this is really a stupid test just to see if things arent horribly slow.
+ * ideally the test would actually fail instead of hanging...
+ */
+ [Test]
+ public void TestDecomposition5()
+ {
+ TokenStream ts = analyzer.GetTokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ");
+ try
+ {
+ ts.Reset();
+ while (ts.IncrementToken())
+ {
+
+ }
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ /*
+ // NOTE: intentionally fails! Just trying to debug this
+ // one input...
+ public void testDecomposition6() throws Exception {
+ assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6 },
+ new int[] { 2, 3, 4, 5, 6, 8 }
+ );
+ }
+ */
+
+ /** Tests that sentence offset is incorporated into the resulting offsets */
+ [Test]
+ public void TestTwoSentences()
+ {
+ /*
+ //TokenStream ts = a.tokenStream("foo", "妹の咲子です。俺と年子で、今受験生です。");
+ TokenStream ts = analyzer.tokenStream("foo", "�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;");
+ ts.reset();
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ while(ts.incrementToken()) {
+ System.out.println(" " + termAtt.toString());
+ }
+ System.out.println("DONE PARSE\n\n");
+ */
+
+ AssertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
+ new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス", "魔女", "狩", "大将", "マシュー", "ホプキンス" },
+ new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
+ new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
+ );
+ }
+
+ /** blast some random strings through the analyzer */
+ [Test]
+ public void TestRandomStrings()
+ {
+ CheckRandomData(Random(), analyzer, 1000 * RANDOM_MULTIPLIER);
+ CheckRandomData(Random(), analyzerNoPunct, 1000 * RANDOM_MULTIPLIER);
+ }
+
+ /** blast some random large strings through the analyzer */
+ [Test]
+ public void TestRandomHugeStrings()
+ {
+ Random random = Random();
+ CheckRandomData(random, analyzer, 100 * RANDOM_MULTIPLIER, 8192);
+ CheckRandomData(random, analyzerNoPunct, 100 * RANDOM_MULTIPLIER, 8192);
+ }
+
+ [Test]
+ public void TestRandomHugeStringsMockGraphAfter()
+ {
+ // Randomly inject graph tokens after JapaneseTokenizer:
+ Random random = Random();
+ CheckRandomData(random,
+ Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
+ TokenStream graph = new MockGraphTokenFilter(Random(), tokenizer);
+ return new TokenStreamComponents(tokenizer, graph);
+ }),
+ 100 * RANDOM_MULTIPLIER, 8192);
+ }
+
+ [Test]
+ public void TestLargeDocReliability()
+ {
+ for (int i = 0; i < 100; i++)
+ {
+ String s = TestUtil.RandomUnicodeString(Random(), 10000);
+ TokenStream ts = analyzer.GetTokenStream("foo", s);
+ try
+ {
+ ts.Reset();
+ while (ts.IncrementToken())
+ {
+ }
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+ }
+
+ /** simple test for supplementary characters */
+ [Test]
+ public void TestSurrogates()
+ {
+ AssertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
+ new String[] { "𩬅", "艱", "鍟", "䇹", "愯", "瀛" });
+ }
+
+ /** random test ensuring we don't ever split supplementaries */
+ [Test]
+ public void TestSurrogates2()
+ {
+ int numIterations = AtLeast(10000);
+ for (int i = 0; i < numIterations; i++)
+ {
+ if (VERBOSE)
+ {
+ Console.WriteLine("\nTEST: iter=" + i);
+ }
+ String s = TestUtil.RandomUnicodeString(Random(), 100);
+ TokenStream ts = analyzer.GetTokenStream("foo", s);
+ try
+ {
+ ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
+ ts.Reset();
+ while (ts.IncrementToken())
+ {
+ assertTrue(UnicodeUtil.ValidUTF16String(termAtt));
+ }
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+ }
+
+ [Test]
+ public void TestOnlyPunctuation()
+ {
+ TokenStream ts = analyzerNoPunct.GetTokenStream("foo", "。、。。");
+ try
+ {
+ ts.Reset();
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ [Test]
+ public void TestOnlyPunctuationExtended()
+ {
+ TokenStream ts = extendedModeAnalyzerNoPunct.GetTokenStream("foo", "......");
+ try
+ {
+ ts.Reset();
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ // note: test is kinda silly since kuromoji emits punctuation tokens.
+ // but, when/if we filter these out it will be useful.
+ [Test]
+ public void TestEnd()
+ {
+ AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", "これは本ではない"),
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6 },
+ new int[] { 2, 3, 4, 5, 6, 8 },
+ new int?(8)
+ );
+
+ AssertTokenStreamContents(analyzerNoPunct.GetTokenStream("foo", "これは本ではない "),
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6, 8 },
+ new int[] { 2, 3, 4, 5, 6, 8, 9 },
+ new int?(12)
+ );
+ }
+
+ [Test]
+ public void TestUserDict()
+ {
+ // Not a great test because w/o userdict.txt the
+ // segmentation is the same:
+ AssertTokenStreamContents(analyzer.GetTokenStream("foo", "関西国際空港に行った"),
+ new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
+ new int[] { 0, 2, 4, 6, 7, 9 },
+ new int[] { 2, 4, 6, 7, 9, 10 },
+ new int?(10)
+ );
+ }
+
+ [Test]
+ public void TestUserDict2()
+ {
+ // Better test: w/o userdict the segmentation is different:
+ AssertTokenStreamContents(analyzer.GetTokenStream("foo", "朝青龍"),
+ new String[] { "朝青龍" },
+ new int[] { 0 },
+ new int[] { 3 },
+ new int?(3)
+ );
+ }
+
+ [Test]
+ public void TestUserDict3()
+ {
+ // Test entry that breaks into multiple tokens:
+ AssertTokenStreamContents(analyzer.GetTokenStream("foo", "abcd"),
+ new String[] { "a", "b", "cd" },
+ new int[] { 0, 1, 2 },
+ new int[] { 1, 2, 4 },
+ new int?(4)
+ );
+ }
+
+ // HMM: fails (segments as a/b/cd/efghij)... because the
+ // two paths have exactly equal paths (1 KNOWN + 1
+ // UNKNOWN) and we don't seem to favor longer KNOWN /
+ // shorter UNKNOWN matches:
+
+ /*
+ public void testUserDict4() {
+ // Test entry that has another entry as prefix
+ assertTokenStreamContents(analyzer.tokenStream("foo", "abcdefghij"),
+ new String[] { "ab", "cd", "efg", "hij" },
+ new int[] { 0, 2, 4, 7 },
+ new int[] { 2, 4, 7, 10 },
+ new int?(10)
+ );
+ }
+ */
+
+ [Test]
+ public void TestSegmentation()
+ {
+ // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+ // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+ // String[] surfaceForms = {
+ // "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+ // "スペース", "ステーション", "に", "行き", "ます", "。",
+ // "うたがわしい", "。"
+ // };
+ String input = "スペースステーションに行きます。うたがわしい。";
+ String[]
+ surfaceForms = {
+ "スペース", "ステーション", "に", "行き", "ます", "。",
+ "うたがわしい", "。"
+ };
+ AssertAnalyzesTo(analyzer,
+ input,
+ surfaceForms);
+ }
+
+ [Test]
+ public void TestLatticeToDot()
+ {
+ GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.GetInstance());
+ Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ JapaneseTokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH)
+ {
+ GraphvizFormatter = gv2
+ };
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+
+ String input = "スペースステーションに行きます。うたがわしい。";
+ String[] surfaceForms = {
+ "スペース", "ステーション", "に", "行き", "ます", "。",
+ "うたがわしい", "。"
+ };
+ AssertAnalyzesTo(analyzer,
+ input,
+ surfaceForms);
+
+
+ assertTrue(gv2.Finish().IndexOf("22.0") != -1);
+ }
+
+ private void assertReadings(String input, params String[] readings)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IReadingAttribute readingAtt = ts.AddAttribute<IReadingAttribute>();
+ ts.Reset();
+ foreach (String reading in readings)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(reading, readingAtt.GetReading());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ private void assertPronunciations(String input, params String[] pronunciations)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IReadingAttribute readingAtt = ts.AddAttribute<IReadingAttribute>();
+ ts.Reset();
+ foreach (String pronunciation in pronunciations)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(pronunciation, readingAtt.GetPronunciation());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ private void assertBaseForms(String input, params String[] baseForms)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IBaseFormAttribute baseFormAtt = ts.AddAttribute<IBaseFormAttribute>();
+ ts.Reset();
+ foreach (String baseForm in baseForms)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(baseForm, baseFormAtt.GetBaseForm());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ private void assertInflectionTypes(String input, params String[] inflectionTypes)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IInflectionAttribute inflectionAtt = ts.AddAttribute<IInflectionAttribute>();
+ ts.Reset();
+ foreach (String inflectionType in inflectionTypes)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(inflectionType, inflectionAtt.GetInflectionType());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ private void assertInflectionForms(String input, params String[] inflectionForms)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IInflectionAttribute inflectionAtt = ts.AddAttribute<IInflectionAttribute>();
+ ts.Reset();
+ foreach (String inflectionForm in inflectionForms)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(inflectionForm, inflectionAtt.GetInflectionForm());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ private void assertPartsOfSpeech(String input, params String[] partsOfSpeech)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", input);
+ try
+ {
+ IPartOfSpeechAttribute partOfSpeechAtt = ts.AddAttribute<IPartOfSpeechAttribute>();
+ ts.Reset();
+ foreach (String partOfSpeech in partsOfSpeech)
+ {
+ assertTrue(ts.IncrementToken());
+ assertEquals(partOfSpeech, partOfSpeechAtt.GetPartOfSpeech());
+ }
+ assertFalse(ts.IncrementToken());
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+
+ [Test]
+ public void TestReadings()
+ {
+ assertReadings("寿司が食べたいです。",
+ "スシ",
+ "ガ",
+ "タベ",
+ "タイ",
+ "デス",
+ "。");
+ }
+
+ [Test]
+ public void TestReadings2()
+ {
+ assertReadings("多くの学生が試験に落ちた。",
+ "オオク",
+ "ノ",
+ "ガクセイ",
+ "ガ",
+ "シケン",
+ "ニ",
+ "オチ",
+ "タ",
+ "。");
+ }
+
+ [Test]
+ public void TestPronunciations()
+ {
+ assertPronunciations("寿司が食べたいです。",
+ "スシ",
+ "ガ",
+ "タベ",
+ "タイ",
+ "デス",
+ "。");
+ }
+
+ [Test]
+ public void TestPronunciations2()
+ {
+ // pronunciation differs from reading here
+ assertPronunciations("多くの学生が試験に落ちた。",
+ "オーク",
+ "ノ",
+ "ガクセイ",
+ "ガ",
+ "シケン",
+ "ニ",
+ "オチ",
+ "タ",
+ "。");
+ }
+
+ [Test]
+ public void TestBasicForms()
+ {
+ assertBaseForms("それはまだ実験段階にあります。",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "ある",
+ null,
+ null);
+ }
+
+ [Test]
+ public void TestInflectionTypes()
+ {
+ assertInflectionTypes("それはまだ実験段階にあります。",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "五段・ラ行",
+ "特殊・マス",
+ null);
+ }
+
+ [Test]
+ public void TestInflectionForms()
+ {
+ assertInflectionForms("それはまだ実験段階にあります。",
+ null,
+ null,
+ null,
+ null,
+ null,
+ null,
+ "連用形",
+ "基本形",
+ null);
+ }
+
+ [Test]
+ public void TestPartOfSpeech()
+ {
+ assertPartsOfSpeech("それはまだ実験段階にあります。",
+ "名詞-代名詞-一般",
+ "助詞-係助詞",
+ "副詞-助詞類接続",
+ "名詞-サ変接続",
+ "名詞-一般",
+ "助詞-格助詞-一般",
+ "動詞-自立",
+ "助動詞",
+ "記号-句点");
+ }
+
+ // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+ // do we have a possibility to actually lookup the first and last word from dictionary?
+ [Test]
+ public void TestYabottai()
+ {
+ AssertAnalyzesTo(analyzer, "やぼったい",
+ new String[] { "やぼったい" });
+ }
+
+ [Test]
+ public void TestTsukitosha()
+ {
+ AssertAnalyzesTo(analyzer, "突き通しゃ",
+ new String[] { "突き通しゃ" });
+ }
+
+ [Test]
+ public void TestBocchan()
+ {
+ doTestBocchan(1);
+ }
+
+ [Test, LongRunningTest]//@Nightly
+ [Ignore("This test takes a long time to run - do it manually")]
+ public void TestBocchanBig()
+ {
+ doTestBocchan(100);
+ }
+
+ /*
+ public void testWikipedia() {
+ final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
+ final Reader r = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
+
+ final long startTimeNS = System.nanoTime();
+ boolean done = false;
+ long compoundCount = 0;
+ long nonCompoundCount = 0;
+ long netOffset = 0;
+ while (!done) {
+ final TokenStream ts = analyzer.tokenStream("ignored", r);
+ ts.reset();
+ final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ int count = 0;
+ while (true) {
+ if (!ts.incrementToken()) {
+ done = true;
+ break;
+ }
+ count++;
+ if (posIncAtt.getPositionIncrement() == 0) {
+ compoundCount++;
+ } else {
+ nonCompoundCount++;
+ if (nonCompoundCount % 1000000 == 0) {
+ System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
+ (System.nanoTime()-startTimeNS)/1000000.0,
+ netOffset + offsetAtt.startOffset(),
+ nonCompoundCount,
+ compoundCount));
+ }
+ }
+ if (count == 100000000) {
+ System.out.println(" again...");
+ break;
+ }
+ }
+ ts.end();
+ netOffset += offsetAtt.endOffset();
+ }
+ System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
+ r.close();
+ }
+ */
+
+
+ private void doTestBocchan(int numIterations)
+ {
+ TextReader reader = new StreamReader(
+ this.GetType().getResourceAsStream("bocchan.utf-8"), Encoding.UTF8);
+ String line = reader.ReadLine();
+ reader.Dispose();
+
+ if (VERBOSE)
+ {
+ Console.WriteLine("Test for Bocchan without pre-splitting sentences");
+ }
+
+ /*
+ if (numIterations > 1) {
+ // warmup
+ for (int i = 0; i < numIterations; i++) {
+ final TokenStream ts = analyzer.tokenStream("ignored", line);
+ ts.reset();
+ while(ts.incrementToken());
+ }
+ }
+ */
+
+ long totalStart = Environment.TickCount;
+ for (int i = 0; i < numIterations; i++)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", line);
+ try
+ {
+ ts.Reset();
+ while (ts.IncrementToken()) ;
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+ String[] sentences = Regex.Split(line, "、|。");
+ if (VERBOSE)
+ {
+ Console.WriteLine("Total time : " + (Environment.TickCount - totalStart));
+ Console.WriteLine("Test for Bocchan with pre-splitting sentences (" + sentences.Length + " sentences)");
+ }
+ totalStart = Environment.TickCount;
+ for (int i = 0; i < numIterations; i++)
+ {
+ foreach (String sentence in sentences)
+ {
+ TokenStream ts = analyzer.GetTokenStream("ignored", sentence);
+ try
+ {
+ ts.Reset();
+ while (ts.IncrementToken()) ;
+ ts.End();
+ }
+ finally
+ {
+ IOUtils.DisposeWhileHandlingException(ts);
+ }
+ }
+ }
+ if (VERBOSE)
+ {
+ Console.WriteLine("Total time : " + (Environment.TickCount - totalStart));
+ }
+ }
+
+ [Test]
+ public void TestWithPunctuation()
+ {
+ AssertAnalyzesTo(analyzerNoPunct, "羽田。空港",
+ new String[] { "羽田", "空港" },
+ new int[] { 1, 1 });
+ }
+
+ [Test]
+ public void TestCompoundOverPunctuation()
+ {
+ AssertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢϏΎϷΞͺ羽田",
+ new String[] { "d", "ε", "ε", "ϢϏΎϷΞͺ", "羽田" },
+ new int[] { 1, 1, 1, 1, 1 },
+ new int[] { 1, 1, 1, 1, 1 });
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
new file mode 100644
index 0000000..91fbf16
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestJapaneseTokenizerFactory.cs
@@ -0,0 +1,134 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Simple tests for <see cref="JapaneseTokenizerFactory"/>
+ /// </summary>
+ public class TestJapaneseTokenizerFactory : BaseTokenStreamTestCase
+ {
+ [Test]
+ public void TestSimple()
+ {
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ factory.Inform(new StringMockResourceLoader(""));
+ TokenStream ts = factory.Create(new StringReader("これは本ではない"));
+ AssertTokenStreamContents(ts,
+ new String[] { "これ", "は", "本", "で", "は", "ない" },
+ new int[] { 0, 2, 3, 4, 5, 6 },
+ new int[] { 2, 3, 4, 5, 6, 8 }
+ );
+ }
+
+ /**
+ * Test that search mode is enabled and working by default
+ */
+ [Test]
+ public void TestDefaults()
+ {
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(new Dictionary<String, String>());
+ factory.Inform(new StringMockResourceLoader(""));
+ TokenStream ts = factory.Create(new StringReader("シニアソフトウェアエンジニア"));
+ AssertTokenStreamContents(ts,
+ new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
+ );
+ }
+
+ /**
+ * Test mode parameter: specifying normal mode
+ */
+ [Test]
+ public void TestMode()
+ {
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ args.Put("mode", "normal");
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+ factory.Inform(new StringMockResourceLoader(""));
+ TokenStream ts = factory.Create(new StringReader("シニアソフトウェアエンジニア"));
+ AssertTokenStreamContents(ts,
+ new String[] { "シニアソフトウェアエンジニア" }
+ );
+ }
+
+ /**
+ * Test user dictionary
+ */
+ [Test]
+ public void TestUserDict()
+ {
+ String userDict =
+ "# Custom segmentation for long entries\n" +
+ "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" +
+ "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" +
+ "# Custom reading for sumo wrestler\n" +
+ "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ args.Put("userDictionary", "userdict.txt");
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+ factory.Inform(new StringMockResourceLoader(userDict));
+ TokenStream ts = factory.Create(new StringReader("関西国際空港に行った"));
+ AssertTokenStreamContents(ts,
+ new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
+ );
+ }
+
+ /**
+ * Test preserving punctuation
+ */
+ [Test]
+ public void TestPreservePunctuation()
+ {
+ IDictionary<String, String> args = new Dictionary<String, String>();
+ args.Put("discardPunctuation", "false");
+ JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(args);
+ factory.Inform(new StringMockResourceLoader(""));
+ TokenStream ts = factory.Create(
+ new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。")
+ );
+ AssertTokenStreamContents(ts,
+ new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、",
+ "来週", "の", "頭", "日本", "に", "戻り", "ます", "。",
+ "楽しみ", "に", "し", "て", "い", "ます", "!",
+ "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。" }
+ );
+ }
+
+ /** Test that bogus arguments result in exception */
+ [Test]
+ public void TestBogusArguments()
+ {
+ try
+ {
+ new JapaneseTokenizerFactory(new Dictionary<String, String>() {
+ { "bogusArg", "bogusValue" }
+ });
+ fail();
+ }
+ catch (ArgumentException expected)
+ {
+ assertTrue(expected.Message.Contains("Unknown parameters"));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
new file mode 100644
index 0000000..bb9fdae
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Kuromoji/TestSearchMode.cs
@@ -0,0 +1,92 @@
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ public class TestSearchMode : BaseTokenStreamTestCase
+ {
+ private readonly static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
+ private readonly Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
+ return new TokenStreamComponents(tokenizer, tokenizer);
+ });
+
+
+ /** Test search mode segmentation */
+ [Test]
+ public void TestSearchSegmentation()
+ {
+ Stream @is = typeof(TestSearchMode).getResourceAsStream(SEGMENTATION_FILENAME);
+ if (@is == null)
+ {
+ throw new FileNotFoundException("Cannot find " + SEGMENTATION_FILENAME + " in test classpath");
+ }
+ try
+ {
+ TextReader reader = new StreamReader(@is, Encoding.UTF8);
+ String line = null;
+ int lineNumber = 0;
+
+ while ((line = reader.ReadLine()) != null)
+ {
+ lineNumber++;
+ // Remove comments
+ line = Regex.Replace(line, "#.*$", "");
+ // Skip empty lines or comment lines
+ if (line.Trim() == string.Empty)
+ {
+ continue;
+ }
+ if (VERBOSE)
+ {
+ Console.WriteLine("Line no. " + lineNumber + ": " + line);
+ }
+ String[] fields = new Regex("\t").Split(line, 2); // Regex.Split(line, "\t", 2);
+ String sourceText = fields[0];
+ String[] expectedTokens = Regex.Split(fields[1], "\\s+");
+ int[] expectedPosIncrs = new int[expectedTokens.Length];
+ int[] expectedPosLengths = new int[expectedTokens.Length];
+ for (int tokIDX = 0; tokIDX < expectedTokens.Length; tokIDX++)
+ {
+ if (expectedTokens[tokIDX].EndsWith("/0", StringComparison.Ordinal))
+ {
+ expectedTokens[tokIDX] = Regex.Replace(expectedTokens[tokIDX], "/0", "");
+ expectedPosLengths[tokIDX] = expectedTokens.Length - 1;
+ }
+ else
+ {
+ expectedPosIncrs[tokIDX] = 1;
+ expectedPosLengths[tokIDX] = 1;
+ }
+ }
+ AssertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
+ }
+ }
+ finally
+ {
+ @is.Dispose();
+ }
+ }
+ }
+}