You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:35 UTC
[10/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji +
tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
new file mode 100644
index 0000000..5524be7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilterFactory.cs
@@ -0,0 +1,52 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="JapaneseBaseFormFilter"/>.
+ /// <code>
+ /// <fieldType name="text_ja" class="solr.TextField">
+ /// <analyzer>
+ /// <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ /// <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ /// </analyzer>
+ /// </fieldType>
+ /// </code>
+ /// </summary>
+ public class JapaneseBaseFormFilterFactory : TokenFilterFactory
+ {
+     /// <summary>Creates a new <see cref="JapaneseBaseFormFilterFactory"/></summary>
+     /// <param name="args">Factory arguments. Any entry still present after the base constructor returns is rejected.</param>
+     /// <exception cref="ArgumentException">If <paramref name="args"/> is not empty after the base constructor has run.</exception>
+     public JapaneseBaseFormFilterFactory(IDictionary<string, string> args)
+         : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name in .NET,
+             // so list the offending parameter names explicitly.
+             throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a new <see cref="JapaneseBaseFormFilter"/>.</summary>
+     /// <param name="input">The token stream to wrap.</param>
+     /// <returns>The wrapping <see cref="JapaneseBaseFormFilter"/>.</returns>
+     public override TokenStream Create(TokenStream input)
+     {
+         return new JapaneseBaseFormFilter(input);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
new file mode 100644
index 0000000..71566bb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilter.cs
@@ -0,0 +1,500 @@
+using Lucene.Net.Analysis.Util;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
+ /// </summary>
+ /// <remarks>
+ /// Sequences of iteration marks are supported. In case an illegal sequence of iteration
+ /// marks is encountered, the implementation emits the illegal source character as-is
+ /// without considering its script. For example, with input "?ゝ", we get
+ /// "??" even though "?" isn't hiragana.
+ /// <para/>
+ /// Note that a full stop punctuation character "。" (U+3002) can not be iterated
+ /// (see below). Iteration marks themselves can be emitted in case they are illegal,
+ /// i.e. if they go back past the beginning of the character stream.
+ /// <para/>
+ /// The implementation buffers input until a full stop punctuation character (U+3002)
+ /// or EOF is reached in order to not keep a copy of the character stream in memory.
+ /// Vertical iteration marks, which are even rarer than horizontal iteration marks in
+ /// contemporary Japanese, are unsupported.
+ /// </remarks>
+ public class JapaneseIterationMarkCharFilter : CharFilter
+ {
+ /// <summary>Normalize kanji iteration marks by default</summary>
+ public static readonly bool NORMALIZE_KANJI_DEFAULT = true;
+
+ /// <summary>Normalize kana iteration marks by default</summary>
+ public static readonly bool NORMALIZE_KANA_DEFAULT = true;
+
+ private const char KANJI_ITERATION_MARK = '\u3005'; // 々
+
+ private const char HIRAGANA_ITERATION_MARK = '\u309d'; // ゝ
+
+ private const char HIRAGANA_VOICED_ITERATION_MARK = '\u309e'; // ゞ
+
+ private const char KATAKANA_ITERATION_MARK = '\u30fd'; // ヽ
+
+ private const char KATAKANA_VOICED_ITERATION_MARK = '\u30fe'; // ヾ
+
+ private const char FULL_STOP_PUNCTUATION = '\u3002'; // 。
+
+ // Hiragana to dakuten map (lookup using code point - 0x304b(か))
+ private static char[] h2d = new char[50];
+
+ // Katakana to dakuten map (lookup using code point - 0x30ab(カ))
+ private static char[] k2d = new char[50];
+
+ private readonly RollingCharBuffer buffer = new RollingCharBuffer();
+
+ private int bufferPosition = 0;
+
+ private int iterationMarksSpanSize = 0;
+
+ private int iterationMarkSpanEndPosition = 0;
+
+ private bool normalizeKanji;
+
+ private bool normalizeKana;
+
+ // Populates the hiragana and katakana dakuten (voiced sound mark) lookup tables.
+ // Index i of h2d corresponds to code point 0x304b(か) + i. Characters without a voiced
+ // variant map to themselves so that the table stays contiguous.
+ static JapaneseIterationMarkCharFilter()
+ {
+ // Hiragana dakuten map
+ h2d[0] = '\u304c'; // か => が
+ h2d[1] = '\u304c'; // が => が
+ h2d[2] = '\u304e'; // き => ぎ
+ h2d[3] = '\u304e'; // ぎ => ぎ
+ h2d[4] = '\u3050'; // く => ぐ
+ h2d[5] = '\u3050'; // ぐ => ぐ
+ h2d[6] = '\u3052'; // け => げ
+ h2d[7] = '\u3052'; // げ => げ
+ h2d[8] = '\u3054'; // こ => ご
+ h2d[9] = '\u3054'; // ご => ご
+ h2d[10] = '\u3056'; // さ => ざ
+ h2d[11] = '\u3056'; // ざ => ざ
+ h2d[12] = '\u3058'; // し => じ
+ h2d[13] = '\u3058'; // じ => じ
+ h2d[14] = '\u305a'; // す => ず
+ h2d[15] = '\u305a'; // ず => ず
+ h2d[16] = '\u305c'; // せ => ぜ
+ h2d[17] = '\u305c'; // ぜ => ぜ
+ h2d[18] = '\u305e'; // そ => ぞ
+ h2d[19] = '\u305e'; // ぞ => ぞ
+ h2d[20] = '\u3060'; // た => だ
+ h2d[21] = '\u3060'; // だ => だ
+ h2d[22] = '\u3062'; // ち => ぢ
+ h2d[23] = '\u3062'; // ぢ => ぢ
+ h2d[24] = '\u3063'; // っ => っ (no dakuten variant)
+ h2d[25] = '\u3065'; // つ => づ
+ h2d[26] = '\u3065'; // づ => づ
+ h2d[27] = '\u3067'; // て => で
+ h2d[28] = '\u3067'; // で => で
+ h2d[29] = '\u3069'; // と => ど
+ h2d[30] = '\u3069'; // ど => ど
+ h2d[31] = '\u306a'; // な => な (no dakuten variant)
+ h2d[32] = '\u306b'; // に => に (no dakuten variant)
+ h2d[33] = '\u306c'; // ぬ => ぬ (no dakuten variant)
+ h2d[34] = '\u306d'; // ね => ね (no dakuten variant)
+ h2d[35] = '\u306e'; // の => の (no dakuten variant)
+ h2d[36] = '\u3070'; // は => ば
+ h2d[37] = '\u3070'; // ば => ば
+ h2d[38] = '\u3071'; // ぱ => ぱ (handakuten, left unchanged)
+ h2d[39] = '\u3073'; // ひ => び
+ h2d[40] = '\u3073'; // び => び
+ h2d[41] = '\u3074'; // ぴ => ぴ (handakuten, left unchanged)
+ h2d[42] = '\u3076'; // ふ => ぶ
+ h2d[43] = '\u3076'; // ぶ => ぶ
+ h2d[44] = '\u3077'; // ぷ => ぷ (handakuten, left unchanged)
+ h2d[45] = '\u3079'; // へ => べ
+ h2d[46] = '\u3079'; // べ => べ
+ h2d[47] = '\u307a'; // ぺ => ぺ (handakuten, left unchanged)
+ h2d[48] = '\u307c'; // ほ => ぼ
+ h2d[49] = '\u307c'; // ぼ => ぼ
+
+ // Make katakana dakuten map from hiragana map
+ // Each katakana entry is the hiragana entry shifted by the constant distance 0x30ab - 0x304b.
+ char codePointDifference = (char)('\u30ab' - '\u304b'); // カ - か
+ Debug.Assert(h2d.Length == k2d.Length);
+ for (int i = 0; i < k2d.Length; i++)
+ {
+ k2d[i] = (char)(h2d[i] + codePointDifference);
+ }
+ }
+
+ /// <summary>
+ /// Constructor. Normalizes both kanji and kana iteration marks by default.
+ /// </summary>
+ /// <param name="input">Char stream.</param>
+ public JapaneseIterationMarkCharFilter(TextReader input)
+ : this(input, NORMALIZE_KANJI_DEFAULT, NORMALIZE_KANA_DEFAULT)
+ {
+ }
+
+ /// <summary>
+ /// Constructor
+ /// </summary>
+ /// <param name="input">Char stream.</param>
+ /// <param name="normalizeKanji">Indicates whether kanji iteration marks should be normalized.</param>
+ /// <param name="normalizeKana">Indicates whether kana iteration marks should be normalized.</param>
+ public JapaneseIterationMarkCharFilter(TextReader input, bool normalizeKanji, bool normalizeKana)
+ : base(input)
+ {
+ this.normalizeKanji = normalizeKanji;
+ this.normalizeKana = normalizeKana;
+ buffer.Reset(input);
+ }
+
+ /// <summary>
+ /// Reads up to <paramref name="length"/> normalized characters, one at a time via <see cref="Read()"/>,
+ /// and writes them to <paramref name="buffer"/> beginning at <paramref name="offset"/>.
+ /// </summary>
+ /// <param name="buffer">
+ /// Destination array. When this method returns, the positions from <paramref name="offset"/> up to
+ /// (<paramref name="offset"/> + number of characters read - 1) hold the characters read.
+ /// </param>
+ /// <param name="offset">
+ /// The position in <paramref name="buffer"/> at which to begin writing.
+ /// </param>
+ /// <param name="length">
+ /// The maximum number of characters to read. If the end of input is reached before that many characters
+ /// are read, the method returns early.
+ /// </param>
+ /// <returns>
+ /// The number of characters that have been read, which is at most <paramref name="length"/>.
+ /// Note that this method returns -1 (not 0) when no character was read, i.e. when the end of input
+ /// has been reached or when <paramref name="length"/> is 0.
+ /// </returns>
+ public override int Read(char[] buffer, int offset, int length)
+ {
+ int read = 0;
+
+ for (int i = offset; i < offset + length; i++)
+ {
+ int c = Read();
+ if (c == -1)
+ {
+ break;
+ }
+ buffer[i] = (char)c;
+ read++;
+ }
+
+ // -1 signals that nothing could be read
+ return read == 0 ? -1 : read;
+ }
+
+ /// <summary>
+ /// Reads the next character from the text reader and advances the character position by one character.
+ /// An iteration mark that is enabled for normalization is replaced by the result of
+ /// <see cref="NormalizeIterationMark(char)"/>; all other characters are returned unchanged.
+ /// </summary>
+ /// <returns>The next character from the text reader, or -1 if no more characters are available.</returns>
+ public override int Read()
+ {
+ int ic = buffer.Get(bufferPosition);
+
+ // End of input
+ if (ic == -1)
+ {
+ buffer.FreeBefore(bufferPosition);
+ return ic;
+ }
+
+ char c = (char)ic;
+
+ // Skip surrogate pair characters: moving the span end position just past the surrogate
+ // prevents a following iteration mark span from reaching back to (or across) it
+ if (char.IsHighSurrogate(c) || char.IsLowSurrogate(c))
+ {
+ iterationMarkSpanEndPosition = bufferPosition + 1;
+ }
+
+ // Free rolling buffer on full stop; the span end position is moved past the full stop
+ // as well, so later iteration marks never refer to characters at or before it
+ if (c == FULL_STOP_PUNCTUATION)
+ {
+ buffer.FreeBefore(bufferPosition);
+ iterationMarkSpanEndPosition = bufferPosition + 1;
+ }
+
+ // Normalize iteration mark
+ if (IsIterationMark(c))
+ {
+ c = NormalizeIterationMark(c);
+ }
+
+ bufferPosition++;
+ return c;
+ }
+
+ /// <summary>
+ /// Normalizes the iteration mark character <paramref name="c"/>
+ /// </summary>
+ /// <param name="c">Iteration mark character to normalize.</param>
+ /// <returns>Normalized iteration mark.</returns>
+ /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+ private char NormalizeIterationMark(char c)
+ {
+
+ // Case 1: Inside an iteration mark span
+ if (bufferPosition < iterationMarkSpanEndPosition)
+ {
+ return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
+ }
+
+ // Case 2: New iteration mark spans starts where the previous one ended, which is illegal
+ if (bufferPosition == iterationMarkSpanEndPosition)
+ {
+ // Emit the illegal iteration mark and increase end position to indicate that we can't
+ // start a new span on the next position either
+ iterationMarkSpanEndPosition++;
+ return c;
+ }
+
+ // Case 3: New iteration mark span
+ iterationMarksSpanSize = NextIterationMarkSpanSize();
+ iterationMarkSpanEndPosition = bufferPosition + iterationMarksSpanSize;
+ return Normalize(SourceCharacter(bufferPosition, iterationMarksSpanSize), c);
+ }
+
+ /// <summary>
+ /// Counts the run of consecutive iteration marks that begins at the current buffer position,
+ /// capped so that the span never refers back past the previous span end position.
+ /// </summary>
+ /// <returns>Size of the iteration mark span starting at the current buffer position.</returns>
+ /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+ private int NextIterationMarkSpanSize()
+ {
+     int markCount = 0;
+     int position = bufferPosition;
+     while (true)
+     {
+         int next = buffer.Get(position);
+         if (next == -1 || !IsIterationMark((char)next))
+         {
+             break;
+         }
+         markCount++;
+         position++;
+     }
+
+     // A span of N marks repeats the N characters before it; those characters must all
+     // lie at or after the previous span end position.
+     bool reachesTooFarBack = bufferPosition - markCount < iterationMarkSpanEndPosition;
+     return reachesTooFarBack ? bufferPosition - iterationMarkSpanEndPosition : markCount;
+ }
+
+ /// <summary>
+ /// Returns the source character for a given position and iteration mark span size.
+ /// </summary>
+ /// <param name="position">Buffer position (should not exceed bufferPosition).</param>
+ /// <param name="spanSize">Iteration mark span size.</param>
+ /// <returns>Source character.</returns>
+ /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+ private char SourceCharacter(int position, int spanSize)
+ {
+ return (char)buffer.Get(position - spanSize);
+ }
+
+ /// <summary>
+ /// Normalize a character.
+ /// </summary>
+ /// <param name="c">Character to normalize.</param>
+ /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+ /// <returns>Normalized character - return c on illegal iteration marks.</returns>
+ private char Normalize(char c, char m)
+ {
+ if (IsHiraganaIterationMark(m))
+ {
+ return NormalizedHiragana(c, m);
+ }
+
+ if (IsKatakanaIterationMark(m))
+ {
+ return NormalizedKatakana(c, m);
+ }
+
+ return c; // If m is not kana and we are to normalize it, we assume it is kanji and simply return it
+ }
+
+ /// <summary>
+ /// Applies a hiragana iteration mark to the hiragana character it repeats.
+ /// </summary>
+ /// <param name="c">Hiragana character being repeated.</param>
+ /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+ /// <returns>
+ /// The voiced form of <paramref name="c"/> for the voiced mark, the character one code point below
+ /// <paramref name="c"/> for the plain mark when <paramref name="c"/> is classified as dakuten,
+ /// and <paramref name="c"/> itself otherwise.
+ /// </returns>
+ private char NormalizedHiragana(char c, char m)
+ {
+     if (m == HIRAGANA_VOICED_ITERATION_MARK)
+     {
+         return LookupHiraganaDakuten(c);
+     }
+
+     if (m == HIRAGANA_ITERATION_MARK && IsHiraganaDakuten(c))
+     {
+         return (char)(c - 1);
+     }
+
+     return c;
+ }
+
+ /// <summary>
+ /// Applies a katakana iteration mark to the katakana character it repeats.
+ /// </summary>
+ /// <param name="c">Katakana character being repeated.</param>
+ /// <param name="m">Repetition mark referring to <paramref name="c"/>.</param>
+ /// <returns>
+ /// The voiced form of <paramref name="c"/> for the voiced mark, the character one code point below
+ /// <paramref name="c"/> for the plain mark when <paramref name="c"/> is classified as dakuten,
+ /// and <paramref name="c"/> itself otherwise.
+ /// </returns>
+ private char NormalizedKatakana(char c, char m)
+ {
+     if (m == KATAKANA_VOICED_ITERATION_MARK)
+     {
+         return LookupKatakanaDakuten(c);
+     }
+
+     if (m == KATAKANA_ITERATION_MARK && IsKatakanaDakuten(c))
+     {
+         return (char)(c - 1);
+     }
+
+     return c;
+ }
+
+ /// <summary>
+ /// Iteration mark character predicate.
+ /// </summary>
+ /// <param name="c">Character to test.</param>
+ /// <returns><c>true</c> if <paramref name="c"/> is an iteration mark character. Otherwise <c>false</c>.</returns>
+ private bool IsIterationMark(char c)
+ {
+ return IsKanjiIterationMark(c) || IsHiraganaIterationMark(c) || IsKatakanaIterationMark(c);
+ }
+
+ /// <summary>
+ /// Tests whether a character is a hiragana iteration mark that this filter should normalize.
+ /// </summary>
+ /// <param name="c">Character to test.</param>
+ /// <returns>
+ /// <c>true</c> if kana normalization is enabled and <paramref name="c"/> is the plain or voiced
+ /// hiragana iteration mark. Otherwise <c>false</c>.
+ /// </returns>
+ private bool IsHiraganaIterationMark(char c)
+ {
+     return normalizeKana
+         && (c == HIRAGANA_ITERATION_MARK || c == HIRAGANA_VOICED_ITERATION_MARK);
+ }
+
+ /// <summary>
+ /// Tests whether a character is a katakana iteration mark that this filter should normalize.
+ /// </summary>
+ /// <param name="c">Character to test.</param>
+ /// <returns>
+ /// <c>true</c> if kana normalization is enabled and <paramref name="c"/> is the plain or voiced
+ /// katakana iteration mark. Otherwise <c>false</c>.
+ /// </returns>
+ private bool IsKatakanaIterationMark(char c)
+ {
+     return normalizeKana
+         && (c == KATAKANA_ITERATION_MARK || c == KATAKANA_VOICED_ITERATION_MARK);
+ }
+
+ /// <summary>
+ /// Tests whether a character is the kanji iteration mark and this filter should normalize it.
+ /// </summary>
+ /// <param name="c">Character to test.</param>
+ /// <returns>
+ /// <c>true</c> if kanji normalization is enabled and <paramref name="c"/> is the kanji iteration mark.
+ /// Otherwise <c>false</c>.
+ /// </returns>
+ private bool IsKanjiIterationMark(char c)
+ {
+     return normalizeKanji && c == KANJI_ITERATION_MARK;
+ }
+
+ /// <summary>
+ /// Look up hiragana dakuten.
+ /// </summary>
+ /// <param name="c">Character to look up.</param>
+ /// <returns>Hiragana dakuten variant of c or c itself if no dakuten variant exists.</returns>
+ private char LookupHiraganaDakuten(char c)
+ {
+ return Lookup(c, h2d, '\u304b'); // Code point is for か
+ }
+
+ /// <summary>
+ /// Look up katakana dakuten. Only full-width katakana are supported.
+ /// </summary>
+ /// <param name="c">Character to look up.</param>
+ /// <returns>Katakana dakuten variant of <paramref name="c"/> or <paramref name="c"/> itself if no dakuten variant exists.</returns>
+ private char LookupKatakanaDakuten(char c)
+ {
+ return Lookup(c, k2d, '\u30ab'); // Code point is for カ
+ }
+
+ /// <summary>
+ /// Hiragana dakuten predicate.
+ /// </summary>
+ /// <remarks>
+ /// The dakuten map also contains characters that have no voiced variant and therefore map to
+ /// themselves (e.g. な or っ). Testing only "c maps to itself" would classify those as dakuten and
+ /// make the caller "unvoice" them to the preceding code point (な would become ど). A character is
+ /// voiced only if the code point directly before it is mapped to it.
+ /// </remarks>
+ /// <param name="c">Character to check.</param>
+ /// <returns><c>true</c> if c is a hiragana dakuten and otherwise <c>false</c>.</returns>
+ private bool IsHiraganaDakuten(char c)
+ {
+     // Inside() guarantees c >= 0x304b, so (c - 1) cannot underflow.
+     return Inside(c, h2d, '\u304b') && LookupHiraganaDakuten((char)(c - 1)) == c;
+ }
+
+ /// <summary>
+ /// Katakana dakuten predicate.
+ /// </summary>
+ /// <remarks>
+ /// The dakuten map also contains characters that have no voiced variant and therefore map to
+ /// themselves (e.g. ナ or ッ). Testing only "c maps to itself" would classify those as dakuten and
+ /// make the caller "unvoice" them to the preceding code point (ナ would become ド). A character is
+ /// voiced only if the code point directly before it is mapped to it.
+ /// </remarks>
+ /// <param name="c">Character to check.</param>
+ /// <returns><c>true</c> if c is a katakana dakuten and otherwise <c>false</c>.</returns>
+ private bool IsKatakanaDakuten(char c)
+ {
+     // Inside() guarantees c >= 0x30ab, so (c - 1) cannot underflow.
+     return Inside(c, k2d, '\u30ab') && LookupKatakanaDakuten((char)(c - 1)) == c;
+ }
+
+ /// <summary>
+ /// Maps a character through a dakuten table, leaving characters outside the table untouched.
+ /// </summary>
+ /// <param name="c">Character to look up.</param>
+ /// <param name="map">Dakuten map.</param>
+ /// <param name="offset">Code point that corresponds to index 0 of <paramref name="map"/>.</param>
+ /// <returns>The entry of <paramref name="map"/> for <paramref name="c"/>, or <paramref name="c"/> if it is out of range.</returns>
+ private char Lookup(char c, char[] map, char offset)
+ {
+     return Inside(c, map, offset) ? map[c - offset] : c;
+ }
+
+ /// <summary>
+ /// Tells whether a character falls inside the code point range covered by a dakuten map.
+ /// </summary>
+ /// <param name="c">Character to look up.</param>
+ /// <param name="map">Dakuten map.</param>
+ /// <param name="offset">Code point that corresponds to index 0 of <paramref name="map"/>.</param>
+ /// <returns><c>true</c> if <paramref name="c"/> has an index in <paramref name="map"/> and otherwise <c>false</c>.</returns>
+ private bool Inside(char c, char[] map, char offset)
+ {
+     int index = c - offset;
+     return index >= 0 && index < map.Length;
+ }
+
+ protected override int Correct(int currentOff)
+ {
+ return currentOff; // this filter doesn't change the length of strings
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
new file mode 100644
index 0000000..c9518c9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseIterationMarkCharFilterFactory.cs
@@ -0,0 +1,66 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="JapaneseIterationMarkCharFilter"/>.
+ /// <code>
+ /// <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ ///   <analyzer>
+ ///     <charFilter class="solr.JapaneseIterationMarkCharFilterFactory" normalizeKanji="true" normalizeKana="true"/>
+ ///     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ ///   </analyzer>
+ /// </fieldType>
+ /// </code>
+ /// </summary>
+ public class JapaneseIterationMarkCharFilterFactory : CharFilterFactory, IMultiTermAwareComponent
+ {
+     private static readonly string NORMALIZE_KANJI_PARAM = "normalizeKanji";
+     private static readonly string NORMALIZE_KANA_PARAM = "normalizeKana";
+
+     private readonly bool normalizeKanji;
+     private readonly bool normalizeKana;
+
+     /// <summary>Creates a new <see cref="JapaneseIterationMarkCharFilterFactory"/></summary>
+     /// <param name="args">
+     /// Factory arguments. The optional "normalizeKanji" and "normalizeKana" entries are read here and
+     /// default to the filter's own defaults; any entry still present afterwards is rejected.
+     /// </param>
+     /// <exception cref="ArgumentException">If <paramref name="args"/> is not empty once the known parameters have been read.</exception>
+     public JapaneseIterationMarkCharFilterFactory(IDictionary<string, string> args)
+         : base(args)
+     {
+         normalizeKanji = GetBoolean(args, NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT);
+         normalizeKana = GetBoolean(args, NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT);
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name in .NET,
+             // so list the offending parameter names explicitly.
+             throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a <see cref="JapaneseIterationMarkCharFilter"/> using the configured flags.</summary>
+     /// <param name="input">The reader to wrap.</param>
+     /// <returns>The wrapping char filter.</returns>
+     public override TextReader Create(TextReader input)
+     {
+         return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana);
+     }
+
+     /// <summary>Returns this factory itself as the multi-term component.</summary>
+     public virtual AbstractAnalysisFactory GetMultiTermComponent()
+     {
+         return this;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
new file mode 100644
index 0000000..857e5bf
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilter.cs
@@ -0,0 +1,111 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that normalizes common katakana spelling variations
+ /// ending in a long sound character by removing this character (U+30FC). Only
+ /// katakana words longer than a minimum length are stemmed (default is four).
+ /// </summary>
+ /// <remarks>
+ /// Note that only full-width katakana characters are supported. Please use a
+ /// <see cref="Cjk.CJKWidthFilter"/> to convert half-width
+ /// katakana to full-width before using this filter.
+ /// <para/>
+ /// In order to prevent terms from being stemmed, use an instance of
+ /// <see cref="Miscellaneous.SetKeywordMarkerFilter"/>
+ /// or a custom <see cref="TokenFilter"/> that sets the <see cref="IKeywordAttribute"/>
+ /// before this <see cref="TokenStream"/>.
+ /// </remarks>
+ public sealed class JapaneseKatakanaStemFilter : TokenFilter
+ {
+     /// <summary>Minimum term length used when none is given to the constructor.</summary>
+     public static readonly int DEFAULT_MINIMUM_LENGTH = 4;
+     private const char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
+
+     // Bounds of the Unicode "Katakana" block (U+30A0..U+30FF), i.e. the same set of
+     // characters that the regular expression class \p{IsKatakana} matches.
+     private const char KATAKANA_BLOCK_START = '\u30a0';
+     private const char KATAKANA_BLOCK_END = '\u30ff';
+
+     private readonly ICharTermAttribute termAttr;
+     private readonly IKeywordAttribute keywordAttr;
+     private readonly int minimumKatakanaLength;
+
+     /// <summary>
+     /// Creates a new <see cref="JapaneseKatakanaStemFilter"/>.
+     /// </summary>
+     /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+     /// <param name="minimumLength">Terms shorter than this many characters are never stemmed.</param>
+     public JapaneseKatakanaStemFilter(TokenStream input, int minimumLength)
+         : base(input)
+     {
+         this.minimumKatakanaLength = minimumLength;
+         this.termAttr = AddAttribute<ICharTermAttribute>();
+         this.keywordAttr = AddAttribute<IKeywordAttribute>();
+     }
+
+     /// <summary>
+     /// Creates a new <see cref="JapaneseKatakanaStemFilter"/> using <see cref="DEFAULT_MINIMUM_LENGTH"/>.
+     /// </summary>
+     /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+     public JapaneseKatakanaStemFilter(TokenStream input)
+         : this(input, DEFAULT_MINIMUM_LENGTH)
+     {
+     }
+
+     /// <summary>
+     /// Advances to the next token and, unless it is marked as a keyword, stems it in place.
+     /// </summary>
+     /// <returns><c>true</c> if a token is available; <c>false</c> at the end of the stream.</returns>
+     public override bool IncrementToken()
+     {
+         if (!m_input.IncrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.IsKeyword)
+         {
+             termAttr.SetLength(Stem(termAttr.Buffer, termAttr.Length));
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Computes the stemmed length of a term: one less than <paramref name="length"/> when the term is
+     /// at least the minimum length, consists only of katakana, and ends with the prolonged sound mark
+     /// (U+30FC); otherwise <paramref name="length"/> unchanged.
+     /// </summary>
+     private int Stem(char[] term, int length)
+     {
+         // The explicit empty-term check keeps term[length - 1] safe even when the filter was
+         // constructed with a non-positive minimum length.
+         if (length <= 0 || length < minimumKatakanaLength)
+         {
+             return length;
+         }
+
+         if (!IsKatakana(term, length))
+         {
+             return length;
+         }
+
+         if (term[length - 1] == HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK)
+         {
+             return length - 1;
+         }
+
+         return length;
+     }
+
+     /// <summary>
+     /// Tells whether the first <paramref name="length"/> characters of <paramref name="term"/> all
+     /// belong to the Unicode Katakana block.
+     /// </summary>
+     private bool IsKatakana(char[] term, int length)
+     {
+         for (int i = 0; i < length; i++)
+         {
+             // NOTE: Test only identifies full-width characters -- half-widths are unsupported
+             char ch = term[i];
+             if (ch < KATAKANA_BLOCK_START || ch > KATAKANA_BLOCK_END)
+             {
+                 return false;
+             }
+         }
+         return true;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
new file mode 100644
index 0000000..af2acb5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseKatakanaStemFilterFactory.cs
@@ -0,0 +1,61 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="JapaneseKatakanaStemFilter"/>.
+ /// <code>
+ /// <fieldType name="text_ja" class="solr.TextField">
+ /// <analyzer>
+ /// <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ /// <filter class="solr.JapaneseKatakanaStemFilterFactory"
+ /// minimumLength="4"/>
+ /// </analyzer>
+ /// </fieldType>
+ /// </code>
+ /// </summary>
+ public class JapaneseKatakanaStemFilterFactory : TokenFilterFactory
+ {
+     private static readonly string MINIMUM_LENGTH_PARAM = "minimumLength";
+     private readonly int minimumLength;
+
+     /// <summary>Creates a new <see cref="JapaneseKatakanaStemFilterFactory"/></summary>
+     /// <param name="args">
+     /// Factory arguments. The optional "minimumLength" entry is read here and defaults to
+     /// <see cref="JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH"/>; any entry still present afterwards is rejected.
+     /// </param>
+     /// <exception cref="ArgumentException">
+     /// If the minimum length is less than 2, or if <paramref name="args"/> is not empty once the known
+     /// parameters have been read.
+     /// </exception>
+     public JapaneseKatakanaStemFilterFactory(IDictionary<string, string> args)
+         : base(args)
+     {
+         minimumLength = GetInt32(args, MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
+         if (minimumLength < 2)
+         {
+             throw new ArgumentException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
+         }
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name in .NET,
+             // so list the offending parameter names explicitly.
+             throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a <see cref="JapaneseKatakanaStemFilter"/> using the configured minimum length.</summary>
+     /// <param name="input">The token stream to wrap.</param>
+     /// <returns>The wrapping <see cref="JapaneseKatakanaStemFilter"/>.</returns>
+     public override TokenStream Create(TokenStream input)
+     {
+         return new JapaneseKatakanaStemFilter(input, minimumLength);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
new file mode 100644
index 0000000..2b1ccc4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilter.cs
@@ -0,0 +1,61 @@
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Removes tokens that match a set of part-of-speech tags.
+ /// </summary>
+ public sealed class JapanesePartOfSpeechStopFilter : FilteringTokenFilter
+ {
+     private readonly ISet<string> stopTags;
+     private readonly IPartOfSpeechAttribute posAtt;
+
+     /// <summary>
+     /// Create a new <see cref="JapanesePartOfSpeechStopFilter"/> with explicit control over position increments.
+     /// </summary>
+     /// <param name="version">The Lucene match version.</param>
+     /// <param name="enablePositionIncrements">Passed through to the base filter.</param>
+     /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+     /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
+     [Obsolete("EnablePositionIncrements=false is not supported anymore as of Lucene 4.4.")]
+     public JapanesePartOfSpeechStopFilter(LuceneVersion version, bool enablePositionIncrements, TokenStream input, ISet<string> stopTags)
+         : base(version, enablePositionIncrements, input)
+     {
+         this.posAtt = AddAttribute<IPartOfSpeechAttribute>();
+         this.stopTags = stopTags;
+     }
+
+     /// <summary>
+     /// Create a new <see cref="JapanesePartOfSpeechStopFilter"/>.
+     /// </summary>
+     /// <param name="version">The Lucene match version.</param>
+     /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+     /// <param name="stopTags">The part-of-speech tags that should be removed.</param>
+     public JapanesePartOfSpeechStopFilter(LuceneVersion version, TokenStream input, ISet<string> stopTags)
+         : base(version, input)
+     {
+         this.posAtt = AddAttribute<IPartOfSpeechAttribute>();
+         this.stopTags = stopTags;
+     }
+
+     /// <summary>
+     /// Decides whether the current token is kept: tokens without a part-of-speech tag are always
+     /// kept, tagged tokens are kept only if their tag is not in the stop set.
+     /// </summary>
+     protected override bool Accept()
+     {
+         string tag = posAtt.GetPartOfSpeech();
+         if (tag == null)
+         {
+             return true;
+         }
+         return !stopTags.Contains(tag);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
new file mode 100644
index 0000000..04fc900
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapanesePartOfSpeechStopFilterFactory.cs
@@ -0,0 +1,85 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// Factory for <see cref="JapanesePartOfSpeechStopFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapanesePartOfSpeechStopFilterFactory"
+    ///             tags="stopTags.txt"
+    ///             enablePositionIncrements="true"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapanesePartOfSpeechStopFilterFactory : TokenFilterFactory, IResourceLoaderAware
+    {
+        private readonly string stopTagFiles;
+        private readonly bool enablePositionIncrements;
+        private ISet<string> stopTags;
+
+        /// <summary>Creates a new <see cref="JapanesePartOfSpeechStopFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. Recognized keys are <c>tags</c> (the stop tag resource name(s))
+        /// and <c>enablePositionIncrements</c> (defaults to <c>true</c>).
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// If <paramref name="args"/> still contains entries after the recognized parameters have been read.
+        /// </exception>
+        public JapanesePartOfSpeechStopFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            stopTagFiles = Get(args, "tags");
+            enablePositionIncrements = GetBoolean(args, "enablePositionIncrements", true);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so the leftover entries are spelled out explicitly.
+                throw new ArgumentException("Unknown parameters: " + FormatArgs(args));
+            }
+        }
+
+        /// <summary>
+        /// Loads the stop tags named by the <c>tags</c> argument through <paramref name="loader"/>.
+        /// When no word set is returned, the stop tags are left as <c>null</c>.
+        /// </summary>
+        /// <param name="loader">The resource loader used to read the stop tag resource(s).</param>
+        public virtual void Inform(IResourceLoader loader)
+        {
+            stopTags = null;
+            CharArraySet cas = GetWordSet(loader, stopTagFiles, false);
+            if (cas != null)
+            {
+                stopTags = new HashSet<string>();
+                foreach (string element in cas)
+                {
+                    stopTags.Add(element);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="stream"/> in a <see cref="JapanesePartOfSpeechStopFilter"/>,
+        /// or returns it unchanged when no stop tags are loaded.
+        /// </summary>
+        /// <param name="stream">The <see cref="TokenStream"/> to filter.</param>
+        /// <returns>The filtered stream, or <paramref name="stream"/> itself when there are no stop tags.</returns>
+        public override TokenStream Create(TokenStream stream)
+        {
+            // if stoptags is null, it means the file is empty
+            if (stopTags != null)
+            {
+                // The obsolete constructor is used on purpose so that the configured
+                // enablePositionIncrements value is still honored.
+#pragma warning disable 612, 618
+                TokenStream filter = new JapanesePartOfSpeechStopFilter(m_luceneMatchVersion, enablePositionIncrements, stream, stopTags);
+#pragma warning restore 612, 618
+                return filter;
+            }
+            else
+            {
+                return stream;
+            }
+        }
+
+        /// <summary>
+        /// Renders the entries of <paramref name="args"/> as <c>{key=value, key=value}</c>
+        /// for use in error messages.
+        /// </summary>
+        private static string FormatArgs(IDictionary<string, string> args)
+        {
+            List<string> pairs = new List<string>(args.Count);
+            foreach (KeyValuePair<string, string> pair in args)
+            {
+                pairs.Add(pair.Key + "=" + pair.Value);
+            }
+            return "{" + string.Join(", ", pairs) + "}";
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
new file mode 100644
index 0000000..b2e1542
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilter.cs
@@ -0,0 +1,89 @@
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Analysis.TokenAttributes;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that overwrites the term attribute of each token
+    /// with the token's reading, in either katakana or romaji form.
+    /// Katakana is the default reading form.
+    /// </summary>
+    public sealed class JapaneseReadingFormFilter : TokenFilter
+    {
+        private readonly ICharTermAttribute termAttr;
+        private readonly IReadingAttribute readingAttr;
+
+        // Scratch buffer reused across tokens for the romanized text.
+        private readonly StringBuilder buffer = new StringBuilder();
+        private readonly bool useRomaji;
+
+        /// <summary>
+        /// Create a new <see cref="JapaneseReadingFormFilter"/> that emits katakana readings.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        public JapaneseReadingFormFilter(TokenStream input)
+            : this(input, false)
+        {
+        }
+
+        /// <summary>
+        /// Create a new <see cref="JapaneseReadingFormFilter"/>.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to consume.</param>
+        /// <param name="useRomaji"><c>true</c> to emit romanized text instead of the plain reading.</param>
+        public JapaneseReadingFormFilter(TokenStream input, bool useRomaji)
+            : base(input)
+        {
+            this.termAttr = AddAttribute<ICharTermAttribute>();
+            this.readingAttr = AddAttribute<IReadingAttribute>();
+            this.useRomaji = useRomaji;
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and rewrites the term text of the new token.
+        /// </summary>
+        /// <returns><c>false</c> once the wrapped stream is exhausted; otherwise <c>true</c>.</returns>
+        public override bool IncrementToken()
+        {
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+
+            string reading = readingAttr.GetReading();
+
+            if (!useRomaji)
+            {
+                // Just replace the term text with the reading, if it exists.
+                if (reading != null)
+                {
+                    termAttr.SetEmpty().Append(reading);
+                }
+                return true;
+            }
+
+            // Romanize the reading; if it's an OOV term (no reading), just try the term text.
+            string source = reading ?? termAttr.ToString();
+            buffer.Length = 0;
+            ToStringUtil.GetRomanization(buffer, source);
+            termAttr.SetEmpty().Append(buffer);
+            return true;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
new file mode 100644
index 0000000..9464c2e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseReadingFormFilterFactory.cs
@@ -0,0 +1,57 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// Factory for <see cref="JapaneseReadingFormFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.JapaneseReadingFormFilterFactory"
+    ///             useRomaji="false"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class JapaneseReadingFormFilterFactory : TokenFilterFactory
+    {
+        private static readonly string ROMAJI_PARAM = "useRomaji";
+        private readonly bool useRomaji;
+
+        /// <summary>Creates a new <see cref="JapaneseReadingFormFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments. The only recognized key is <c>useRomaji</c> (defaults to <c>false</c>).
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// If <paramref name="args"/> still contains entries after the recognized parameter has been read.
+        /// </exception>
+        public JapaneseReadingFormFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            useRomaji = GetBoolean(args, ROMAJI_PARAM, false);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so the leftover entries are spelled out explicitly.
+                throw new ArgumentException("Unknown parameters: " + FormatArgs(args));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="JapaneseReadingFormFilter"/>
+        /// configured with this factory's <c>useRomaji</c> setting.
+        /// </summary>
+        /// <param name="input">The <see cref="TokenStream"/> to filter.</param>
+        /// <returns>The new <see cref="JapaneseReadingFormFilter"/>.</returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new JapaneseReadingFormFilter(input, useRomaji);
+        }
+
+        /// <summary>
+        /// Renders the entries of <paramref name="args"/> as <c>{key=value, key=value}</c>
+        /// for use in error messages.
+        /// </summary>
+        private static string FormatArgs(IDictionary<string, string> args)
+        {
+            List<string> pairs = new List<string>(args.Count);
+            foreach (KeyValuePair<string, string> pair in args)
+            {
+                pairs.Add(pair.Key + "=" + pair.Value);
+            }
+            return "{" + string.Join(", ", pairs) + "}";
+        }
+    }
+}