You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:34 UTC
[09/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji +
tests
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
new file mode 100644
index 0000000..4690549
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs
@@ -0,0 +1,1489 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Threading;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tokenizer for Japanese that uses morphological analysis.
+ /// </summary>
+ /// <remarks>
+ /// This tokenizer sets a number of additional attributes:
+ /// <list type="bullet">
+ /// <item><description><see cref="IBaseFormAttribute"/> containing base form for inflected adjectives and verbs.</description></item>
+ /// <item><description><see cref="IPartOfSpeechAttribute"/> containing part-of-speech.</description></item>
+ /// <item><description><see cref="IReadingAttribute"/> containing reading and pronunciation.</description></item>
+ /// <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item>
+ /// </list>
+ /// <para/>
+ /// This tokenizer uses a rolling Viterbi search to find the
+ /// least cost segmentation (path) of the incoming characters.
+ /// For tokens that appear to be compound (> length 2 for all
+ /// Kanji, or > length 7 for non-Kanji), we see if there is a
+ /// 2nd best segmentation of that token after applying
+ /// penalties to the long tokens. If so, and the Mode is
+ /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation
+ /// as well.
+ /// </remarks>
+ public sealed class JapaneseTokenizer : Tokenizer
+ {
+ // LUCENENET specific: de-nested Mode and renamed JapaneseTokenizerMode
+
        /// <summary>
        /// Default tokenization mode. Currently this is <see cref="JapaneseTokenizerMode.SEARCH"/>.
        /// </summary>
        public static readonly JapaneseTokenizerMode DEFAULT_MODE = JapaneseTokenizerMode.SEARCH;

        // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType

        // Emit detailed Viterbi debugging to the console when true:
        private static readonly bool VERBOSE = false;

        // In SEARCH/EXTENDED mode, all-Kanji candidates longer than this
        // receive a length penalty (see ComputePenalty):
        private static readonly int SEARCH_MODE_KANJI_LENGTH = 2;

        // Likewise for non-all-Kanji candidates:
        private static readonly int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH

        // Penalty added per character beyond the corresponding length threshold:
        private static readonly int SEARCH_MODE_KANJI_PENALTY = 3000;

        private static readonly int SEARCH_MODE_OTHER_PENALTY = 1700;

        // For safety:
        private static readonly int MAX_UNKNOWN_WORD_LENGTH = 1024;
        private static readonly int MAX_BACKTRACE_GAP = 1024;

        // Maps each token type (KNOWN/UNKNOWN/USER) to the dictionary that
        // supplies its entries; populated in the constructor:
        private readonly IDictionary<JapaneseTokenizerType, IDictionary> dictionaryMap = new Dictionary<JapaneseTokenizerType, IDictionary>();

        // System (known-word) dictionary, unknown-word dictionary, and
        // bigram connection costs — all process-wide singletons:
        private readonly TokenInfoFST fst;
        private readonly TokenInfoDictionary dictionary;
        private readonly UnknownDictionary unkDictionary;
        private readonly ConnectionCosts costs;
        private readonly UserDictionary userDictionary;
        private readonly CharacterDefinition characterDefinition;

        // Scratch state reused for FST traversal and word-id lookups:
        private readonly FST.Arc<long?> arc = new FST.Arc<long?>();
        private readonly FST.BytesReader fstReader;
        private readonly Int32sRef wordIdRef = new Int32sRef();

        // FST state for the optional user dictionary; both are null when no
        // user dictionary was supplied:
        private readonly FST.BytesReader userFSTReader;
        private readonly TokenInfoFST userFST;

        // Rolling window over the input reader, addressed by absolute
        // character position:
        private readonly RollingCharBuffer buffer = new RollingCharBuffer();

        // Viterbi lattice positions, indexed by absolute character position:
        private readonly WrappedPositionArray positions = new WrappedPositionArray();

        // Behavior flags derived from the tokenization mode (see constructor):
        private readonly bool discardPunctuation;
        private readonly bool searchMode;
        private readonly bool extendedMode;
        private readonly bool outputCompounds;

        // Index of the last character of unknown word:
        private int unknownWordEndIndex = -1;

        // True once we've hit the EOF from the input reader:
        private bool end;

        // Last absolute position we backtraced from:
        private int lastBackTracePos;

        // Position of last token we returned; we use this to
        // figure out whether to set posIncr to 0 or 1:
        private int lastTokenPos;

        // Next absolute position to process:
        private int pos;

        // Already parsed, but not yet passed to caller, tokens:
        private readonly IList<Token> pending = new List<Token>();

        // Attributes populated for each token emitted by IncrementToken:
        private readonly ICharTermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly IPositionIncrementAttribute posIncAtt;
        private readonly IPositionLengthAttribute posLengthAtt;
        private readonly IBaseFormAttribute basicFormAtt;
        private readonly IPartOfSpeechAttribute posAtt;
        private readonly IReadingAttribute readingAtt;
        private readonly IInflectionAttribute inflectionAtt;
+
        /// <summary>
        /// Create a new JapaneseTokenizer.
        /// <para/>
        /// Uses the default AttributeFactory.
        /// </summary>
        /// <param name="input">TextReader containing text.</param>
        /// <param name="userDictionary">Optional: if non-null, user dictionary.</param>
        /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param>
        /// <param name="mode">Tokenization mode.</param>
        public JapaneseTokenizer(TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode)
            : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode) // delegate to the full constructor
        {
        }
+
        /// <summary>
        /// Create a new JapaneseTokenizer.
        /// </summary>
        /// <param name="factory">The AttributeFactory to use.</param>
        /// <param name="input">TextReader containing text.</param>
        /// <param name="userDictionary">Optional: if non-null, user dictionary.</param>
        /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param>
        /// <param name="mode">Tokenization mode.</param>
        public JapaneseTokenizer
            (AttributeFactory factory, TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode)
            : base(factory, input)
        {
            // Register the attributes this tokenizer populates per token:
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLengthAtt = AddAttribute<IPositionLengthAttribute>();
            this.basicFormAtt = AddAttribute<IBaseFormAttribute>();
            this.posAtt = AddAttribute<IPartOfSpeechAttribute>();
            this.readingAtt = AddAttribute<IReadingAttribute>();
            this.inflectionAtt = AddAttribute<IInflectionAttribute>();

            // Acquire the shared (singleton) system dictionaries and costs:
            dictionary = TokenInfoDictionary.GetInstance();
            fst = dictionary.FST;
            unkDictionary = UnknownDictionary.GetInstance();
            characterDefinition = unkDictionary.CharacterDefinition;
            this.userDictionary = userDictionary;
            costs = ConnectionCosts.GetInstance();
            fstReader = fst.GetBytesReader();
            // The user dictionary is optional; leave its FST state null when absent:
            if (userDictionary != null)
            {
                userFST = userDictionary.FST;
                userFSTReader = userFST.GetBytesReader();
            }
            else
            {
                userFST = null;
                userFSTReader = null;
            }
            this.discardPunctuation = discardPunctuation;
            // Derive the three behavior flags from the requested mode:
            switch (mode)
            {
                case JapaneseTokenizerMode.SEARCH:
                    searchMode = true;
                    extendedMode = false;
                    outputCompounds = true;
                    break;
                case JapaneseTokenizerMode.EXTENDED:
                    searchMode = true;
                    extendedMode = true;
                    outputCompounds = false;
                    break;
                default:
                    // NORMAL mode: no decompounding, no unigram fallback.
                    searchMode = false;
                    extendedMode = false;
                    outputCompounds = false;
                    break;
            }
            // Attach the rolling character buffer to the input reader:
            buffer.Reset(this.m_input);

            ResetState();

            // NOTE: the USER entry is null when no user dictionary was supplied.
            dictionaryMap[JapaneseTokenizerType.KNOWN] = dictionary;
            dictionaryMap[JapaneseTokenizerType.UNKNOWN] = unkDictionary;
            dictionaryMap[JapaneseTokenizerType.USER] = userDictionary;
        }
+
        // Optional Graphviz sink; when non-null, Backtrace reports the lattice to it.
        private GraphvizFormatter dotOut;

        // LUCENENET specific - added getter and made into property
        // so we can set this during object initialization.

        /// <summary>
        /// Expert: set this to produce graphviz (dot) output of
        /// the Viterbi lattice.
        /// </summary>
        public GraphvizFormatter GraphvizFormatter
        {
            get { return this.dotOut; }
            set { this.dotOut = value; }
        }
+
        /// <summary>
        /// Releases resources used by this tokenizer.
        /// </summary>
        protected override void Dispose(bool disposing)
        {
            base.Dispose(disposing);
            if (disposing)
            {
                // Re-point the rolling buffer at the input so its buffered
                // characters can be reclaimed:
                buffer.Reset(m_input);
            }
        }
+
        /// <summary>
        /// Resets this stream to start tokenizing from the (possibly new)
        /// input reader, clearing all Viterbi state.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            buffer.Reset(m_input);
            ResetState();
        }
+
        // Restores all per-stream parsing state to its initial values and
        // seeds the lattice with the beginning-of-sentence node.
        private void ResetState()
        {
            positions.Reset();
            unknownWordEndIndex = -1;
            pos = 0;
            end = false;
            lastBackTracePos = 0;
            lastTokenPos = -1; // -1 so the first token always gets posIncr=1
            pending.Clear();

            // Add BOS:
            positions.Get(0).Add(0, 0, -1, -1, -1, JapaneseTokenizerType.KNOWN);
        }
+
        /// <summary>
        /// Performs end-of-stream operations: sets the final offset to the
        /// (corrected) last position processed.
        /// </summary>
        public override void End()
        {
            base.End();
            // Set final offset
            int finalOffset = CorrectOffset(pos);
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }
+
        /// <summary>
        /// Returns the added cost that a 2nd best segmentation is
        /// allowed to have. Ie, if we see path with cost X,
        /// ending in a compound word, and this method returns
        /// threshold &gt; 0, then we will also find the 2nd best
        /// segmentation and if its path score is within this
        /// threshold of X, we'll include it in the output.
        /// </summary>
        private int ComputeSecondBestThreshold(int pos, int length)
        {
            // TODO: maybe we do something else here, instead of just
            // using the penalty...? EG we can be more aggressive on
            // when to also test for 2nd best path
            return ComputePenalty(pos, length);
        }
+
+ private int ComputePenalty(int pos, int length)
+ {
+ if (length > SEARCH_MODE_KANJI_LENGTH)
+ {
+ bool allKanji = true;
+ // check if node consists of only kanji
+ int endPos = pos + length;
+ for (int pos2 = pos; pos2 < endPos; pos2++)
+ {
+ if (!characterDefinition.IsKanji((char)buffer.Get(pos2)))
+ {
+ allKanji = false;
+ break;
+ }
+ }
+ if (allKanji)
+ { // Process only Kanji keywords
+ return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
+ }
+ else if (length > SEARCH_MODE_OTHER_LENGTH)
+ {
+ return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
+ }
+ }
+ return 0;
+ }
+
+ // LUCENENET specific - de-nested Position class
+
        /// <summary>
        /// Adds a lattice arc ending at <paramref name="endPos"/> for the word
        /// <paramref name="wordID"/> supplied by <paramref name="dict"/>, choosing
        /// the cheapest incoming path at <paramref name="fromPosData"/> as its
        /// back-pointer. When <paramref name="addPenalty"/> is set (or compounds
        /// are suppressed in search mode), the search-mode length penalty is
        /// folded into the arc's cost.
        /// </summary>
        private void Add(IDictionary dict, Position fromPosData, int endPos, int wordID, JapaneseTokenizerType type, bool addPenalty)
        {
            int wordCost = dict.GetWordCost(wordID);
            int leftID = dict.GetLeftId(wordID);
            int leastCost = int.MaxValue;
            int leastIDX = -1;
            Debug.Assert(fromPosData.count > 0);
            // Pick the cheapest arriving path, scoring each by its path cost
            // plus the bigram (connection) cost into this word:
            for (int idx = 0; idx < fromPosData.count; idx++)
            {
                // Cost is path cost so far, plus word cost (added at
                // end of loop), plus bigram cost:
                int cost = fromPosData.costs[idx] + costs.Get(fromPosData.lastRightID[idx], leftID);
                if (VERBOSE)
                {
                    Console.WriteLine(" fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.Get(fromPosData.lastRightID[idx], leftID) + " leftID=" + leftID);
                }
                if (cost < leastCost)
                {
                    leastCost = cost;
                    leastIDX = idx;
                    if (VERBOSE)
                    {
                        Console.WriteLine(" **");
                    }
                }
            }

            leastCost += wordCost;

            if (VERBOSE)
            {
                Console.WriteLine(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.Get(endPos).count);
            }

            // User-dictionary entries are never penalized:
            if ((addPenalty || (!outputCompounds && searchMode)) && type != JapaneseTokenizerType.USER)
            {
                int penalty = ComputePenalty(fromPosData.pos, endPos - fromPosData.pos);
                if (VERBOSE)
                {
                    if (penalty > 0)
                    {
                        Console.WriteLine(" + penalty=" + penalty + " cost=" + (leastCost + penalty));
                    }
                }
                leastCost += penalty;
            }

            //positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, wordID, type);
            // NOTE(review): relies on left and right connection IDs being equal
            // for every dictionary entry — confirm against the dictionary format.
            Debug.Assert(leftID == dict.GetRightId(wordID));
            positions.Get(endPos).Add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
        }
+
+ public override bool IncrementToken()
+ {
+
+ // parse() is able to return w/o producing any new
+ // tokens, when the tokens it had produced were entirely
+ // punctuation. So we loop here until we get a real
+ // token or we end:
+ while (pending.Count == 0)
+ {
+ if (end)
+ {
+ return false;
+ }
+
+ // Push Viterbi forward some more:
+ Parse();
+ }
+
+ Token token = pending.LastOrDefault();
+ if (token != null)
+ {
+ pending.Remove(token);
+ }
+
+ int position = token.Position;
+ int length = token.Length;
+ ClearAttributes();
+ Debug.Assert(length > 0);
+ //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
+ termAtt.CopyBuffer(token.SurfaceForm, token.Offset, length);
+ offsetAtt.SetOffset(CorrectOffset(position), CorrectOffset(position + length));
+ basicFormAtt.SetToken(token);
+ posAtt.SetToken(token);
+ readingAtt.SetToken(token);
+ inflectionAtt.SetToken(token);
+ if (token.Position == lastTokenPos)
+ {
+ posIncAtt.PositionIncrement = 0;
+ posLengthAtt.PositionLength = token.PositionLength;
+ }
+ else
+ {
+ Debug.Assert(token.Position > lastTokenPos);
+ posIncAtt.PositionIncrement = 1;
+ posLengthAtt.PositionLength = 1;
+ }
+ if (VERBOSE)
+ {
+ Console.WriteLine(Thread.CurrentThread.Name + ": incToken: return token=" + token);
+ }
+ lastTokenPos = token.Position;
+ return true;
+ }
+
+ /// <summary>
+ /// Incrementally parse some more characters. This runs
+ /// the viterbi search forwards "enough" so that we
+ /// generate some more tokens. How much forward depends on
+ /// the chars coming in, since some chars could cause
+ /// longer-lasting ambiguity in the parsing. Once the
+ /// ambiguity is resolved, then we back trace, produce
+ /// the pending tokens, and return.
+ /// </summary>
        private void Parse()
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nPARSE");
            }

            // Advances over each position (character):
            while (true)
            {
                if (buffer.Get(pos) == -1)
                {
                    // End
                    break;
                }

                Position posData = positions.Get(pos);
                bool isFrontier = positions.GetNextPos() == pos + 1;

                if (posData.count == 0)
                {
                    // No arcs arrive here; move to next position:
                    if (VERBOSE)
                    {
                        Console.WriteLine(" no arcs in; skip pos=" + pos);
                    }
                    pos++;
                    continue;
                }

                if (pos > lastBackTracePos && posData.count == 1 && isFrontier)
                {
                    // We are at a "frontier", and only one node is
                    // alive, so whatever the eventual best path is must
                    // come through this node. So we can safely commit
                    // to the prefix of the best path at this point:
                    Backtrace(posData, 0);

                    // Re-base cost so we don't risk int overflow:
                    posData.costs[0] = 0;

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                    }
                }

                if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP)
                {
                    // Safety: if we've buffered too much, force a
                    // backtrace now. We find the least-cost partial
                    // path, across all paths, backtrace from it, and
                    // then prune all others. Note that this, in
                    // general, can produce the wrong result, if the
                    // total best path did not in fact back trace
                    // through this partial best path. But it's the
                    // best we can do... (short of not having a
                    // safety!).

                    // First pass: find least cost partial path so far,
                    // including ending at future positions:
                    int leastIDX = -1;
                    int leastCost = int.MaxValue;
                    Position leastPosData = null;
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        for (int idx = 0; idx < posData2.count; idx++)
                        {
                            //System.out.println(" idx=" + idx + " cost=" + cost);
                            int cost = posData2.costs[idx];
                            if (cost < leastCost)
                            {
                                leastCost = cost;
                                leastIDX = idx;
                                leastPosData = posData2;
                            }
                        }
                    }

                    // We will always have at least one live path:
                    Debug.Assert(leastIDX != -1);

                    // Second pass: prune all but the best path:
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        if (posData2 != leastPosData)
                        {
                            posData2.Reset();
                        }
                        else
                        {
                            // Compact the surviving arc into slot 0:
                            if (leastIDX != 0)
                            {
                                posData2.costs[0] = posData2.costs[leastIDX];
                                posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
                                posData2.backPos[0] = posData2.backPos[leastIDX];
                                posData2.backIndex[0] = posData2.backIndex[leastIDX];
                                posData2.backID[0] = posData2.backID[leastIDX];
                                posData2.backType[0] = posData2.backType[leastIDX];
                            }
                            posData2.count = 1;
                        }
                    }

                    Backtrace(leastPosData, 0);

                    // Re-base cost so we don't risk int overflow:
                    Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0);

                    if (pos != leastPosData.pos)
                    {
                        // We jumped into a future position:
                        Debug.Assert(pos < leastPosData.pos);
                        pos = leastPosData.pos;
                    }

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                        continue;
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\n extend @ pos=" + pos + " char=" + (char)buffer.Get(pos));
                }

                if (VERBOSE)
                {
                    Console.WriteLine(" " + posData.count + " arcs in");
                }

                bool anyMatches = false;

                // First try user dict:
                if (userFST != null)
                {
                    userFST.GetFirstArc(arc);
                    int output = 0;
                    // Walk the user FST forward from this position, adding an
                    // arc for every final state (= user-dictionary match):
                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null)
                        {
                            break;
                        }
                        output += (int)arc.Output;
                        if (arc.IsFinal)
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine(" USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
                            }
                            Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false);
                            anyMatches = true;
                        }
                    }
                }

                // TODO: we can be more aggressive about user
                // matches? if we are "under" a user match then don't
                // extend KNOWN/UNKNOWN paths?

                if (!anyMatches)
                {
                    // Next, try known dictionary matches
                    fst.GetFirstArc(arc);
                    int output = 0;

                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        //System.out.println(" match " + (char) ch + " posAhead=" + posAhead);

                        if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null)
                        {
                            break;
                        }

                        output += (int)arc.Output;

                        // Optimization: for known words that are too-long
                        // (compound), we should pre-compute the 2nd
                        // best segmentation and store it in the
                        // dictionary instead of recomputing it each time a
                        // match is found.

                        if (arc.IsFinal)
                        {
                            // A single surface form can map to several word
                            // entries (homonyms); add an arc for each:
                            dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef);
                            if (VERBOSE)
                            {
                                Console.WriteLine(" KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs");
                            }
                            for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                            {
                                Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false);
                                anyMatches = true;
                            }
                        }
                    }
                }

                // In the case of normal mode, it doesn't process unknown word greedily.

                if (!searchMode && unknownWordEndIndex > posData.pos)
                {
                    pos++;
                    continue;
                }

                char firstCharacter = (char)buffer.Get(pos);
                if (!anyMatches || characterDefinition.IsInvoke(firstCharacter))
                {

                    // Find unknown match:
                    int characterId = characterDefinition.GetCharacterClass(firstCharacter);
                    bool isPunct = IsPunctuation(firstCharacter);

                    // NOTE: copied from UnknownDictionary.lookup:
                    int unknownWordLength;
                    if (!characterDefinition.IsGroup(firstCharacter))
                    {
                        unknownWordLength = 1;
                    }
                    else
                    {
                        // Extract unknown word. Characters with the same character class are considered to be part of unknown word
                        unknownWordLength = 1;
                        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++)
                        {
                            int ch = buffer.Get(posAhead);
                            if (ch == -1)
                            {
                                break;
                            }
                            if (characterId == characterDefinition.GetCharacterClass((char)ch) &&
                                IsPunctuation((char)ch) == isPunct)
                            {
                                unknownWordLength++;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }

                    unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
                    if (VERBOSE)
                    {
                        Console.WriteLine(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs");
                    }
                    for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                    {
                        Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false);
                    }

                    unknownWordEndIndex = posData.pos + unknownWordLength;
                }

                pos++;
            }

            end = true;

            if (pos > 0)
            {
                // Input exhausted: pick the cheapest path into EOS and
                // backtrace the remainder of the lattice from it:
                Position endPosData = positions.Get(pos);
                int leastCost = int.MaxValue;
                int leastIDX = -1;
                if (VERBOSE)
                {
                    Console.WriteLine(" end: " + endPosData.count + " nodes");
                }
                for (int idx = 0; idx < endPosData.count; idx++)
                {
                    // Add EOS cost:
                    int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0);
                    //System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
                    if (cost < leastCost)
                    {
                        leastCost = cost;
                        leastIDX = idx;
                    }
                }

                Backtrace(endPosData, leastIDX);
            }
            else
            {
                // No characters in the input string; return no tokens!
            }
        }
+
        /// <summary>
        /// Eliminates arcs from the lattice that are compound
        /// tokens (have a penalty) or are not congruent with the
        /// compound token we've matched (ie, span across the
        /// startPos). This should be fairly efficient, because we
        /// just keep the already intersected structure of the
        /// graph, eg we don't have to consult the FSTs again.
        /// </summary>
        private void PruneAndRescore(int startPos, int endPos, int bestStartIDX)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" pruneAndRescore startPos=" + startPos + " endPos=" + endPos + " bestStartIDX=" + bestStartIDX);
            }

            // First pass: walk backwards, building up the forward
            // arcs and pruning inadmissible arcs:
            for (int pos = endPos; pos > startPos; pos--)
            {
                Position posData = positions.Get(pos);
                if (VERBOSE)
                {
                    Console.WriteLine(" back pos=" + pos);
                }
                for (int arcIDX = 0; arcIDX < posData.count; arcIDX++)
                {
                    int backPos = posData.backPos[arcIDX];
                    if (backPos >= startPos)
                    {
                        // Keep this arc:
                        //System.out.println(" keep backPos=" + backPos);
                        positions.Get(backPos).AddForward(pos,
                            arcIDX,
                            posData.backID[arcIDX],
                            posData.backType[arcIDX]);
                    }
                    else
                    {
                        // Arc crosses startPos — not congruent with the compound; drop it.
                        if (VERBOSE)
                        {
                            Console.WriteLine(" prune");
                        }
                    }
                }
                if (pos != startPos)
                {
                    posData.count = 0;
                }
            }

            // Second pass: walk forward, re-scoring:
            for (int pos = startPos; pos < endPos; pos++)
            {
                Position posData = positions.Get(pos);
                if (VERBOSE)
                {
                    Console.WriteLine(" forward pos=" + pos + " count=" + posData.forwardCount);
                }
                if (posData.count == 0)
                {
                    // No arcs arrive here...
                    if (VERBOSE)
                    {
                        Console.WriteLine(" skip");
                    }
                    posData.forwardCount = 0;
                    continue;
                }

                if (pos == startPos)
                {
                    // On the initial position, only consider the best
                    // path so we "force congruence": the
                    // sub-segmentation is "in context" of what the best
                    // path (compound token) had matched:
                    int rightID;
                    if (startPos == 0)
                    {
                        rightID = 0;
                    }
                    else
                    {
                        rightID = GetDict(posData.backType[bestStartIDX]).GetRightId(posData.backID[bestStartIDX]);
                    }
                    int pathCost = posData.costs[bestStartIDX];
                    for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
                    {
                        JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                        IDictionary dict2 = GetDict(forwardType);
                        int wordID = posData.forwardID[forwardArcIDX];
                        int toPos = posData.forwardPos[forwardArcIDX];
                        // Re-score with the compound-length penalty applied:
                        int newCost = pathCost + dict2.GetWordCost(wordID) +
                            costs.Get(rightID, dict2.GetLeftId(wordID)) +
                            ComputePenalty(pos, toPos - pos);
                        if (VERBOSE)
                        {
                            Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + ComputePenalty(pos, toPos - pos) + " toPos.idx=" + positions.Get(toPos).count);
                        }
                        positions.Get(toPos).Add(newCost,
                            dict2.GetRightId(wordID),
                            pos,
                            bestStartIDX,
                            wordID,
                            forwardType);
                    }
                }
                else
                {
                    // On non-initial positions, we maximize score
                    // across all arriving lastRightIDs:
                    for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
                    {
                        JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                        int toPos = posData.forwardPos[forwardArcIDX];
                        if (VERBOSE)
                        {
                            Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos);
                        }
                        Add(GetDict(forwardType),
                            posData,
                            toPos,
                            posData.forwardID[forwardArcIDX],
                            forwardType,
                            true);
                    }
                }
                posData.forwardCount = 0;
            }
        }
+
+ // Backtrace from the provided position, back to the last
+ // time we back-traced, accumulating the resulting tokens to
+ // the pending list. The pending list is then in-reverse
+ // (last token should be returned first).
+ private void Backtrace(Position endPosData, int fromIDX)
+ {
+ int endPos = endPosData.pos;
+
+ if (VERBOSE)
+ {
+ Console.WriteLine("\n backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
+ }
+
+ char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos);
+
+ if (dotOut != null)
+ {
+ dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+ }
+
+ int pos = endPos;
+ int bestIDX = fromIDX;
+ Token altToken = null;
+
+ // We trace backwards, so this will be the leftWordID of
+ // the token after the one we are now on:
+ int lastLeftWordID = -1;
+
+ int backCount = 0;
+
+ // TODO: sort of silly to make Token instances here; the
+ // back trace has all info needed to generate the
+ // token. So, we could just directly set the attrs,
+ // from the backtrace, in incrementToken w/o ever
+ // creating Token; we'd have to defer calling freeBefore
+ // until after the backtrace was fully "consumed" by
+ // incrementToken.
+
+ while (pos > lastBackTracePos)
+ {
+ //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
+ Position posData = positions.Get(pos);
+ Debug.Assert(bestIDX < posData.count);
+
+ int backPos = posData.backPos[bestIDX];
+ Debug.Assert(backPos >= lastBackTracePos, "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos);
+ int length = pos - backPos;
+ JapaneseTokenizerType backType = posData.backType[bestIDX];
+ int backID = posData.backID[bestIDX];
+ int nextBestIDX = posData.backIndex[bestIDX];
+
+ if (outputCompounds && searchMode && altToken == null && backType != JapaneseTokenizerType.USER)
+ {
+
+ // In searchMode, if best path had picked a too-long
+ // token, we use the "penalty" to compute the allowed
+ // max cost of an alternate back-trace. If we find an
+ // alternate back trace with cost below that
+ // threshold, we pursue it instead (but also output
+ // the long token).
+ //System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
+
+ int penalty = ComputeSecondBestThreshold(backPos, pos - backPos);
+
+ if (penalty > 0)
+ {
+ if (VERBOSE)
+ {
+ Console.WriteLine(" compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
+ }
+
+ // Use the penalty to set maxCost on the 2nd best
+ // segmentation:
+ int maxCost = posData.costs[bestIDX] + penalty;
+ if (lastLeftWordID != -1)
+ {
+ maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID);
+ }
+
+ // Now, prune all too-long tokens from the graph:
+ PruneAndRescore(backPos, pos,
+ posData.backIndex[bestIDX]);
+
+ // Finally, find 2nd best back-trace and resume
+ // backtrace there:
+ int leastCost = int.MaxValue;
+ int leastIDX = -1;
+ for (int idx = 0; idx < posData.count; idx++)
+ {
+ int cost = posData.costs[idx];
+ //System.out.println(" idx=" + idx + " prevCost=" + cost);
+
+ if (lastLeftWordID != -1)
+ {
+ cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]),
+ lastLeftWordID);
+ //System.out.println(" += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
+ //lastLeftWordID) + " -> " + cost);
+ }
+ //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
+ //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
+ if (cost < leastCost)
+ {
+ //System.out.println(" ** ");
+ leastCost = cost;
+ leastIDX = idx;
+ }
+ }
+ //System.out.println(" leastIDX=" + leastIDX);
+
+ if (VERBOSE)
+ {
+ Console.WriteLine(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
+ }
+
+ if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos)
+ {
+ // We should have pruned the altToken from the graph:
+ Debug.Assert(posData.backPos[leastIDX] != backPos);
+
+ // Save the current compound token, to output when
+ // this alternate path joins back:
+ altToken = new Token(backID,
+ fragment,
+ backPos - lastBackTracePos,
+ length,
+ backType,
+ backPos,
+ GetDict(backType));
+
+ // Redirect our backtrace to 2nd best:
+ bestIDX = leastIDX;
+ nextBestIDX = posData.backIndex[bestIDX];
+
+ backPos = posData.backPos[bestIDX];
+ length = pos - backPos;
+ backType = posData.backType[bestIDX];
+ backID = posData.backID[bestIDX];
+ backCount = 0;
+ //System.out.println(" do alt token!");
+
+ }
+ else
+ {
+ // I think in theory it's possible there is no
+ // 2nd best path, which is fine; in this case we
+ // only output the compound token:
+ //System.out.println(" no alt token! bestIDX=" + bestIDX);
+ }
+ }
+ }
+
+ int offset = backPos - lastBackTracePos;
+ Debug.Assert(offset >= 0);
+
+ if (altToken != null && altToken.Position >= backPos)
+ {
+
+ // We've backtraced to the position where the
+ // compound token starts; add it now:
+
+ // The pruning we did when we created the altToken
+ // ensures that the back trace will align back with
+ // the start of the altToken:
+ Debug.Assert(altToken.Position == backPos, altToken.Position + " vs " + backPos);
+
+ // NOTE: not quite right: the compound token may
+ // have had all punctuation back traced so far, but
+ // then the decompounded token at this position is
+ // not punctuation. In this case backCount is 0,
+ // but we should maybe add the altToken anyway...?
+
+ if (backCount > 0)
+ {
+ backCount++;
+ altToken.PositionLength = backCount;
+ if (VERBOSE)
+ {
+ Console.WriteLine(" add altToken=" + altToken);
+ }
+ pending.Add(altToken);
+ }
+ else
+ {
+ // This means alt token was all punct tokens:
+ if (VERBOSE)
+ {
+ Console.WriteLine(" discard all-punctuation altToken=" + altToken);
+ }
+ Debug.Assert(discardPunctuation);
+ }
+ altToken = null;
+ }
+
+ IDictionary dict = GetDict(backType);
+
+ if (backType == JapaneseTokenizerType.USER)
+ {
+
+ // Expand the phraseID we recorded into the actual
+ // segmentation:
+ int[] wordIDAndLength = userDictionary.LookupSegmentation(backID);
+ int wordID = wordIDAndLength[0];
+ int current = 0;
+ for (int j = 1; j < wordIDAndLength.Length; j++)
+ {
+ int len = wordIDAndLength[j];
+ //System.out.println(" add user: len=" + len);
+ pending.Add(new Token(wordID + j - 1,
+ fragment,
+ current + offset,
+ len,
+ JapaneseTokenizerType.USER,
+ current + backPos,
+ dict));
+ if (VERBOSE)
+ {
+ Console.WriteLine(" add USER token=" + pending[pending.Count - 1]);
+ }
+ current += len;
+ }
+
+ // Reverse the tokens we just added, because when we
+ // serve them up from incrementToken we serve in
+ // reverse:
+ Collections.Reverse(pending.SubList(pending.Count - (wordIDAndLength.Length - 1),
+ pending.Count));
+
+ backCount += wordIDAndLength.Length - 1;
+ }
+ else
+ {
+
+ if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN)
+ {
+ // In EXTENDED mode we convert unknown word into
+ // unigrams:
+ int unigramTokenCount = 0;
+ for (int i = length - 1; i >= 0; i--)
+ {
+ int charLen = 1;
+ if (i > 0 && char.IsLowSurrogate(fragment[offset + i]))
+ {
+ i--;
+ charLen = 2;
+ }
+ //System.out.println(" extended tok offset="
+ //+ (offset + i));
+ if (!discardPunctuation || !IsPunctuation(fragment[offset + i]))
+ {
+ pending.Add(new Token(CharacterDefinition.NGRAM,
+ fragment,
+ offset + i,
+ charLen,
+ JapaneseTokenizerType.UNKNOWN,
+ backPos + i,
+ unkDictionary));
+ unigramTokenCount++;
+ }
+ }
+ backCount += unigramTokenCount;
+
+ }
+ else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset]))
+ {
+ pending.Add(new Token(backID,
+ fragment,
+ offset,
+ length,
+ backType,
+ backPos,
+ dict));
+ if (VERBOSE)
+ {
+ Console.WriteLine(" add token=" + pending[pending.Count - 1]);
+ }
+ backCount++;
+ }
+ else
+ {
+ if (VERBOSE)
+ {
+ Console.WriteLine(" skip punctuation token=" + new string(fragment, offset, length));
+ }
+ }
+ }
+
+ lastLeftWordID = dict.GetLeftId(backID);
+ pos = backPos;
+ bestIDX = nextBestIDX;
+ }
+
+ lastBackTracePos = endPos;
+
+ if (VERBOSE)
+ {
+ Console.WriteLine(" freeBefore pos=" + endPos);
+ }
+ // Notify the circular buffers that we are done with
+ // these positions:
+ buffer.FreeBefore(endPos);
+ positions.FreeBefore(endPos);
+ }
+
+ /// <summary>
+ /// Looks up the dictionary backing the given token type
+ /// (KNOWN/UNKNOWN/USER); yields <c>null</c> when the type is unmapped.
+ /// </summary>
+ internal IDictionary GetDict(JapaneseTokenizerType type)
+ {
+ IDictionary dict;
+ return dictionaryMap.TryGetValue(type, out dict) ? dict : null;
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> if <paramref name="ch"/> falls into any separator,
+ /// control/format, punctuation, or symbol Unicode category.
+ /// </summary>
+ private static bool IsPunctuation(char ch)
+ {
+ UnicodeCategory category = Character.GetType(ch);
+ switch (category)
+ {
+ // Separators:
+ case UnicodeCategory.SpaceSeparator:
+ case UnicodeCategory.LineSeparator:
+ case UnicodeCategory.ParagraphSeparator:
+ // Control and format characters:
+ case UnicodeCategory.Control:
+ case UnicodeCategory.Format:
+ // Punctuation:
+ case UnicodeCategory.DashPunctuation:
+ case UnicodeCategory.OpenPunctuation:
+ case UnicodeCategory.ClosePunctuation:
+ case UnicodeCategory.ConnectorPunctuation:
+ case UnicodeCategory.OtherPunctuation:
+ case UnicodeCategory.InitialQuotePunctuation:
+ case UnicodeCategory.FinalQuotePunctuation:
+ // Symbols:
+ case UnicodeCategory.MathSymbol:
+ case UnicodeCategory.CurrencySymbol:
+ case UnicodeCategory.ModifierSymbol:
+ case UnicodeCategory.OtherSymbol:
+ return true;
+ default:
+ return false;
+ }
+ }
+ }
+
+ // LUCENENET specific - de-nested Mode and renamed JapaneseTokenizerMode
+
+ /// <summary>
+ /// Tokenization mode: this determines how the tokenizer handles
+ /// compound and unknown words.
+ /// </summary>
+ public enum JapaneseTokenizerMode
+ {
+ /// <summary>
+ /// Ordinary segmentation: no decomposition for compounds.
+ /// </summary>
+ NORMAL,
+
+ /// <summary>
+ /// Segmentation geared towards search: this includes a
+ /// decompounding process for long nouns, also including
+ /// the full compound token as a synonym.
+ /// </summary>
+ SEARCH,
+
+ /// <summary>
+ /// Extended mode outputs unigrams for unknown words.
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ EXTENDED
+ }
+
+ // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType
+
+ /// <summary>
+ /// Token type reflecting the original source of this token
+ /// </summary>
+ public enum JapaneseTokenizerType
+ {
+ /// <summary>
+ /// Known words from the system dictionary.
+ /// </summary>
+ KNOWN,
+ /// <summary>
+ /// Unknown words (heuristically segmented). Unigrams emitted in
+ /// EXTENDED mode also carry this type.
+ /// </summary>
+ UNKNOWN,
+ /// <summary>
+ /// Known words from the user dictionary.
+ /// </summary>
+ USER
+ }
+
+
+ // LUCENENET specific - De-nested Position
+
+ // Holds all back pointers arriving to this position:
+ internal sealed class Position
+ {
+
+ // Absolute position in the input that this instance represents.
+ internal int pos;
+
+ // Number of valid entries in the parallel back-pointer arrays below.
+ internal int count;
+
+ // maybe single int array * 5?
+ // Parallel arrays: entry i describes one arc arriving at this position.
+ internal int[] costs = new int[8]; // accumulated path cost of arc i
+ internal int[] lastRightID = new int[8]; // right id of the arriving token
+ internal int[] backPos = new int[8]; // position the arc started from
+ internal int[] backIndex = new int[8]; // index into the source position's arrays
+ internal int[] backID = new int[8]; // word id of the arriving token
+ internal JapaneseTokenizerType[] backType = new JapaneseTokenizerType[8]; // dictionary type of the arriving token
+
+ // Only used when finding 2nd best segmentation under a
+ // too-long token:
+ internal int forwardCount;
+ internal int[] forwardPos = new int[8];
+ internal int[] forwardID = new int[8];
+ internal int[] forwardIndex = new int[8];
+ internal JapaneseTokenizerType[] forwardType = new JapaneseTokenizerType[8];
+
+ // Grows every back-pointer array to hold at least count+1 entries.
+ public void Grow()
+ {
+ costs = ArrayUtil.Grow(costs, 1 + count);
+ lastRightID = ArrayUtil.Grow(lastRightID, 1 + count);
+ backPos = ArrayUtil.Grow(backPos, 1 + count);
+ backIndex = ArrayUtil.Grow(backIndex, 1 + count);
+ backID = ArrayUtil.Grow(backID, 1 + count);
+
+ // NOTE: sneaky: grow separately because
+ // ArrayUtil.grow will otherwise pick a different
+ // length than the int[]s we just grew:
+ JapaneseTokenizerType[] newBackType = new JapaneseTokenizerType[backID.Length];
+ System.Array.Copy(backType, 0, newBackType, 0, backType.Length);
+ backType = newBackType;
+ }
+
+ // Grows every forward array to hold at least forwardCount+1 entries.
+ public void GrowForward()
+ {
+ forwardPos = ArrayUtil.Grow(forwardPos, 1 + forwardCount);
+ forwardID = ArrayUtil.Grow(forwardID, 1 + forwardCount);
+ forwardIndex = ArrayUtil.Grow(forwardIndex, 1 + forwardCount);
+
+ // NOTE: sneaky: grow separately because
+ // ArrayUtil.grow will otherwise pick a different
+ // length than the int[]s we just grew:
+ JapaneseTokenizerType[] newForwardType = new JapaneseTokenizerType[forwardPos.Length];
+ System.Array.Copy(forwardType, 0, newForwardType, 0, forwardType.Length);
+ forwardType = newForwardType;
+ }
+
+ // Records one arc arriving at this position.
+ public void Add(int cost, int lastRightID, int backPos, int backIndex, int backID, JapaneseTokenizerType backType)
+ {
+ // NOTE: this isn't quite a true Viterbi search,
+ // because we should check if lastRightID is
+ // already present here, and only update if the new
+ // cost is less than the current cost, instead of
+ // simply appending. However, that will likely hurt
+ // performance (usually we add a lastRightID only once),
+ // and it means we actually create the full graph
+ // intersection instead of a "normal" Viterbi lattice:
+ if (count == costs.Length)
+ {
+ Grow();
+ }
+ this.costs[count] = cost;
+ this.lastRightID[count] = lastRightID;
+ this.backPos[count] = backPos;
+ this.backIndex[count] = backIndex;
+ this.backID[count] = backID;
+ this.backType[count] = backType;
+ count++;
+ }
+
+ // Records one forward arc (used only by the 2nd-best segmentation search).
+ public void AddForward(int forwardPos, int forwardIndex, int forwardID, JapaneseTokenizerType forwardType)
+ {
+ if (forwardCount == this.forwardID.Length)
+ {
+ GrowForward();
+ }
+ this.forwardPos[forwardCount] = forwardPos;
+ this.forwardIndex[forwardCount] = forwardIndex;
+ this.forwardID[forwardCount] = forwardID;
+ this.forwardType[forwardCount] = forwardType;
+ forwardCount++;
+ }
+
+ // Clears the back-pointer entries so this instance can be reused.
+ public void Reset()
+ {
+ count = 0;
+ // forwardCount naturally resets after it runs:
+ Debug.Assert(forwardCount == 0, "pos=" + pos + " forwardCount=" + forwardCount);
+ }
+ }
+
+
+ // LUCENENET specific - de-nested WrappedPositionArray
+
+ // TODO: make generic'd version of this "circular array"?
+ // It's a bit tricky because we do things to the Position
+ // (eg, set .pos = N on reuse)...
+ internal sealed class WrappedPositionArray
+ {
+ // Circular buffer of reusable Position instances.
+ private Position[] positions = new Position[8];
+
+ public WrappedPositionArray()
+ {
+ // Pre-allocate every slot; instances are reused via Reset().
+ for (int i = 0; i < positions.Length; i++)
+ {
+ positions[i] = new Position();
+ }
+ }
+
+ // Next array index to write to in positions:
+ private int nextWrite;
+
+ // Next position to write:
+ private int nextPos;
+
+ // How many valid Position instances are held in the
+ // positions array:
+ private int count;
+
+ // Resets every live Position (walking backwards from the last
+ // written slot, wrapping around the array) and rewinds the counters.
+ public void Reset()
+ {
+ nextWrite--;
+ while (count > 0)
+ {
+ if (nextWrite == -1)
+ {
+ nextWrite = positions.Length - 1;
+ }
+ positions[nextWrite--].Reset();
+ count--;
+ }
+ nextWrite = 0;
+ nextPos = 0;
+ count = 0;
+ }
+
+ /// <summary>
+ /// Get Position instance for this absolute position;
+ /// this is allowed to be arbitrarily far "in the
+ /// future" but cannot be before the last freeBefore.
+ /// </summary>
+ public Position Get(int pos)
+ {
+ // Lazily extend the buffer until it covers the requested position:
+ while (pos >= nextPos)
+ {
+ //System.out.println("count=" + count + " vs len=" + positions.length);
+ if (count == positions.Length)
+ {
+ Position[] newPositions = new Position[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ //System.out.println("grow positions " + newPositions.length);
+ // Unwrap the circular contents into the front of the new array:
+ System.Array.Copy(positions, nextWrite, newPositions, 0, positions.Length - nextWrite);
+ System.Array.Copy(positions, 0, newPositions, positions.Length - nextWrite, nextWrite);
+ for (int i = positions.Length; i < newPositions.Length; i++)
+ {
+ newPositions[i] = new Position();
+ }
+ nextWrite = positions.Length;
+ positions = newPositions;
+ }
+ if (nextWrite == positions.Length)
+ {
+ nextWrite = 0;
+ }
+ // Should have already been reset:
+ Debug.Assert(positions[nextWrite].count == 0);
+ positions[nextWrite++].pos = nextPos++;
+ count++;
+ }
+ Debug.Assert(InBounds(pos));
+ int index = GetIndex(pos);
+ Debug.Assert(positions[index].pos == pos);
+ return positions[index];
+ }
+
+ // Returns the next absolute position that would be written.
+ public int GetNextPos()
+ {
+ return nextPos;
+ }
+
+ // For assert:
+ private bool InBounds(int pos)
+ {
+ return pos < nextPos && pos >= nextPos - count;
+ }
+
+ // Maps an absolute position to its slot in the circular array.
+ private int GetIndex(int pos)
+ {
+ int index = nextWrite - (nextPos - pos);
+ if (index < 0)
+ {
+ index += positions.Length;
+ }
+ return index;
+ }
+
+ // Resets and releases all positions before the given absolute
+ // position; they may no longer be requested via Get().
+ public void FreeBefore(int pos)
+ {
+ int toFree = count - (nextPos - pos);
+ Debug.Assert(toFree >= 0);
+ Debug.Assert(toFree <= count);
+ // Start at the oldest live slot and walk forward, wrapping:
+ int index = nextWrite - count;
+ if (index < 0)
+ {
+ index += positions.Length;
+ }
+ for (int i = 0; i < toFree; i++)
+ {
+ if (index == positions.Length)
+ {
+ index = 0;
+ }
+ //System.out.println(" fb idx=" + index);
+ positions[index].Reset();
+ index++;
+ }
+ count -= toFree;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
new file mode 100644
index 0000000..5fe99d5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs
@@ -0,0 +1,100 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="JapaneseTokenizer"/>.
+ /// <code>
+ /// <fieldType name="text_ja" class="solr.TextField">
+ /// <analyzer>
+ /// <tokenizer class="solr.JapaneseTokenizerFactory"
+ /// mode="NORMAL"
+ /// userDictionary="user.txt"
+ /// userDictionaryEncoding="UTF-8"
+ /// discardPunctuation="true"
+ /// />
+ /// <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ /// </analyzer>
+ /// </fieldType>
+ /// </code>
+ /// </summary>
+ public class JapaneseTokenizerFactory : TokenizerFactory, IResourceLoaderAware
+ {
+ private static readonly string MODE = "mode";
+
+ private static readonly string USER_DICT_PATH = "userDictionary";
+
+ private static readonly string USER_DICT_ENCODING = "userDictionaryEncoding";
+
+ private static readonly string DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option
+
+ private UserDictionary userDictionary;
+
+ private readonly JapaneseTokenizerMode mode;
+ private readonly bool discardPunctuation;
+ private readonly string userDictionaryPath;
+ private readonly string userDictionaryEncoding;
+
+ /// <summary>Creates a new <see cref="JapaneseTokenizerFactory"/>.</summary>
+ public JapaneseTokenizerFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ string modeValue = Get(args, MODE, JapaneseTokenizer.DEFAULT_MODE.ToString());
+ // LUCENENET: Enum.TryParse silently leaves the result at the zero value
+ // (NORMAL) on failure, which would mask a typo in the "mode" argument
+ // (and ignore DEFAULT_MODE); report the bad value instead.
+ if (!Enum.TryParse<JapaneseTokenizerMode>(modeValue, true, out mode))
+ {
+ throw new ArgumentException("Invalid mode: " + modeValue);
+ }
+ userDictionaryPath = Get(args, USER_DICT_PATH);
+ userDictionaryEncoding = Get(args, USER_DICT_ENCODING);
+ discardPunctuation = GetBoolean(args, DISCARD_PUNCTUATION, true);
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ /// <summary>
+ /// Loads the user dictionary (if one was configured) from the
+ /// given resource loader, decoding it with the configured encoding
+ /// (UTF-8 when unspecified).
+ /// </summary>
+ public virtual void Inform(IResourceLoader loader)
+ {
+ if (userDictionaryPath != null)
+ {
+ Stream stream = loader.OpenResource(userDictionaryPath);
+ try
+ {
+ string encoding = userDictionaryEncoding;
+ if (encoding == null)
+ {
+ encoding = Encoding.UTF8.WebName;
+ }
+ Encoding decoder = Encoding.GetEncoding(encoding);
+ TextReader reader = new StreamReader(stream, decoder);
+ userDictionary = new UserDictionary(reader);
+ }
+ finally
+ {
+ // Don't leak the resource stream; NOTE(review): assumes
+ // UserDictionary consumes the reader eagerly in its
+ // constructor — confirm before relying on this.
+ stream.Dispose();
+ }
+ }
+ else
+ {
+ userDictionary = null;
+ }
+ }
+
+ /// <summary>Creates a new <see cref="JapaneseTokenizer"/> over <paramref name="input"/>.</summary>
+ public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+ {
+ return new JapaneseTokenizer(factory, input, userDictionary, discardPunctuation, mode);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj
new file mode 100644
index 0000000..782aad3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj
@@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProjectGuid>{8408625A-2508-46D5-8519-045183C43724}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Analysis.Ja</RootNamespace>
+ <AssemblyName>Lucene.Net.Analysis.Kuromoji</AssemblyName>
+ <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup>
+ <Reference Include="System" />
+ <Reference Include="System.Core" />
+ <Reference Include="Microsoft.CSharp" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="Dict\BinaryDictionary.cs" />
+ <Compile Include="Dict\CharacterDefinition.cs" />
+ <Compile Include="Dict\ConnectionCosts.cs" />
+ <Compile Include="Dict\Dictionary.cs" />
+ <Compile Include="Dict\TokenInfoDictionary.cs" />
+ <Compile Include="Dict\TokenInfoFST.cs" />
+ <Compile Include="Dict\UnknownDictionary.cs" />
+ <Compile Include="Dict\UserDictionary.cs" />
+ <Compile Include="GraphvizFormatter.cs" />
+ <Compile Include="JapaneseAnalyzer.cs" />
+ <Compile Include="JapaneseBaseFormFilter.cs" />
+ <Compile Include="JapaneseBaseFormFilterFactory.cs" />
+ <Compile Include="JapaneseIterationMarkCharFilter.cs" />
+ <Compile Include="JapaneseIterationMarkCharFilterFactory.cs" />
+ <Compile Include="JapaneseKatakanaStemFilter.cs" />
+ <Compile Include="JapaneseKatakanaStemFilterFactory.cs" />
+ <Compile Include="JapanesePartOfSpeechStopFilter.cs" />
+ <Compile Include="JapanesePartOfSpeechStopFilterFactory.cs" />
+ <Compile Include="JapaneseReadingFormFilter.cs" />
+ <Compile Include="JapaneseReadingFormFilterFactory.cs" />
+ <Compile Include="JapaneseTokenizer.cs" />
+ <Compile Include="JapaneseTokenizerFactory.cs" />
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="Token.cs" />
+ <Compile Include="TokenAttributes\BaseFormAttribute.cs" />
+ <Compile Include="TokenAttributes\BaseFormAttributeImpl.cs" />
+ <Compile Include="TokenAttributes\InflectionAttribute.cs" />
+ <Compile Include="TokenAttributes\InflectionAttributeImpl.cs" />
+ <Compile Include="TokenAttributes\PartOfSpeechAttribute.cs" />
+ <Compile Include="TokenAttributes\PartOfSpeechAttributeImpl.cs" />
+ <Compile Include="TokenAttributes\ReadingAttribute.cs" />
+ <Compile Include="TokenAttributes\ReadingAttributeImpl.cs" />
+ <Compile Include="Tools\BinaryDictionaryWriter.cs" />
+ <Compile Include="Tools\CharacterDefinitionWriter.cs" />
+ <Compile Include="Tools\ConnectionCostsBuilder.cs" />
+ <Compile Include="Tools\ConnectionCostsWriter.cs" />
+ <Compile Include="Tools\DictionaryBuilder.cs" />
+ <Compile Include="Tools\TokenInfoDictionaryBuilder.cs" />
+ <Compile Include="Tools\TokenInfoDictionaryWriter.cs" />
+ <Compile Include="Tools\UnknownDictionaryBuilder.cs" />
+ <Compile Include="Tools\UnknownDictionaryWriter.cs" />
+ <Compile Include="Util\CSVUtil.cs" />
+ <Compile Include="Util\ToStringUtil.cs" />
+ <Compile Include="..\CommonAssemblyInfo.cs">
+ <Link>Properties\CommonAssemblyInfo.cs</Link>
+ </Compile>
+ </ItemGroup>
+ <ItemGroup>
+ <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj">
+ <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project>
+ <Name>Lucene.Net.Analysis.Common</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj">
+ <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+ <Name>Lucene.Net</Name>
+ </ProjectReference>
+ </ItemGroup>
+ <ItemGroup>
+ <EmbeddedResource Include="Dict\CharacterDefinition.dat" />
+ <EmbeddedResource Include="Dict\ConnectionCosts.dat" />
+ <EmbeddedResource Include="Dict\TokenInfoDictionary%24buffer.dat" />
+ <EmbeddedResource Include="Dict\TokenInfoDictionary%24fst.dat" />
+ <EmbeddedResource Include="Dict\TokenInfoDictionary%24posDict.dat" />
+ <EmbeddedResource Include="Dict\TokenInfoDictionary%24targetMap.dat" />
+ <EmbeddedResource Include="Dict\UnknownDictionary%24buffer.dat" />
+ <EmbeddedResource Include="Dict\UnknownDictionary%24posDict.dat" />
+ <EmbeddedResource Include="Dict\UnknownDictionary%24targetMap.dat" />
+ </ItemGroup>
+ <ItemGroup>
+ <EmbeddedResource Include="stoptags.txt" />
+ <EmbeddedResource Include="stopwords.txt" />
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json
new file mode 100644
index 0000000..86d1c12
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json
@@ -0,0 +1,8 @@
+{
+ "runtimes": {
+ "win": {}
+ },
+ "frameworks": {
+ "net451": {}
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj
new file mode 100644
index 0000000..59a3016
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+-->
+<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup>
+ <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion>
+ <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
+ </PropertyGroup>
+ <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" />
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>87e54ca7-7394-4705-a99a-0dd638265c56</ProjectGuid>
+ <RootNamespace>Lucene.Net.Analysis.Ja</RootNamespace>
+ <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath>
+ <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath>
+ </PropertyGroup>
+ <PropertyGroup>
+ <SchemaVersion>2.0</SchemaVersion>
+ </PropertyGroup>
+ <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" />
+</Project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..2ea0e44
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs
@@ -0,0 +1,46 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analysis.Kuromoji")]
+[assembly: AssemblyDescription("Japanese Morphological Analyzer " +
+ "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.Kuromoji")]
+[assembly: AssemblyCulture("")]
+
+[assembly: CLSCompliant(true)]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("8408625a-2508-46d5-8519-045183c43724")]
+
+// for testing
+[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.Kuromoji")]
+
+// NOTE: Version information is in CommonAssemblyInfo.cs
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Token.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Token.cs b/src/Lucene.Net.Analysis.Kuromoji/Token.cs
new file mode 100644
index 0000000..fd1afd2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Token.cs
@@ -0,0 +1,194 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Support;
+using System.Diagnostics.CodeAnalysis;
+
+namespace Lucene.Net.Analysis.Ja
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Analyzed token with morphological data from its dictionary.
+ /// </summary>
+ public class Token
+ {
+ // Dictionary that supplied this token's morphological data.
+ private readonly IDictionary dictionary;
+
+ // Word id of this token within that dictionary.
+ private readonly int wordId;
+
+ // Slice of the analyzed text buffer that this token covers.
+ private readonly char[] surfaceForm;
+ private readonly int offset;
+ private readonly int length;
+
+ // Character position in the input, and length in tokens (see PositionLength).
+ private readonly int position;
+ private int positionLength;
+
+ // Which dictionary (KNOWN/UNKNOWN/USER) this token originated from.
+ private readonly JapaneseTokenizerType type;
+
+ /// <summary>
+ /// Creates a token covering <paramref name="surfaceForm"/> from
+ /// <paramref name="offset"/> for <paramref name="length"/> chars,
+ /// backed by the given <paramref name="dictionary"/>.
+ /// </summary>
+ public Token(int wordId, char[] surfaceForm, int offset, int length, JapaneseTokenizerType type, int position, IDictionary dictionary)
+ {
+ this.wordId = wordId;
+ this.surfaceForm = surfaceForm;
+ this.offset = offset;
+ this.length = length;
+ this.type = type;
+ this.position = position;
+ this.dictionary = dictionary;
+ }
+
+ public override string ToString()
+ {
+ return "Token(\"" + new string(surfaceForm, offset, length) + "\" pos=" + position + " length=" + length +
+ " posLen=" + positionLength + " type=" + type + " wordId=" + wordId +
+ " leftID=" + dictionary.GetLeftId(wordId) + ")";
+ }
+
+ /// <summary>
+ /// surfaceForm
+ /// </summary>
+ [WritableArray]
+ [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
+ public virtual char[] SurfaceForm
+ {
+ get { return surfaceForm; }
+ }
+
+ /// <summary>
+ /// offset into surfaceForm
+ /// </summary>
+ public virtual int Offset
+ {
+ get { return offset; }
+ }
+
+ /// <summary>
+ /// length of surfaceForm
+ /// </summary>
+ public virtual int Length
+ {
+ get { return length; }
+ }
+
+ /// <summary>
+ /// surfaceForm as a String
+ /// </summary>
+ /// <returns>surfaceForm as a String</returns>
+ public virtual string GetSurfaceFormString()
+ {
+ return new string(surfaceForm, offset, length);
+ }
+
+ /// <summary>
+ /// reading. <c>null</c> if token doesn't have reading.
+ /// </summary>
+ /// <returns>reading. <c>null</c> if token doesn't have reading.</returns>
+ public virtual string GetReading()
+ {
+ return dictionary.GetReading(wordId, surfaceForm, offset, length);
+ }
+
+ /// <summary>
+ /// pronunciation. <c>null</c> if token doesn't have pronunciation.
+ /// </summary>
+ /// <returns>pronunciation. <c>null</c> if token doesn't have pronunciation.</returns>
+ public virtual string GetPronunciation()
+ {
+ return dictionary.GetPronunciation(wordId, surfaceForm, offset, length);
+ }
+
+ /// <summary>
+ /// part of speech.
+ /// </summary>
+ /// <returns>part of speech.</returns>
+ public virtual string GetPartOfSpeech()
+ {
+ return dictionary.GetPartOfSpeech(wordId);
+ }
+
+ /// <summary>
+ /// inflection type or <c>null</c>
+ /// </summary>
+ /// <returns>inflection type or <c>null</c></returns>
+ public virtual string GetInflectionType()
+ {
+ return dictionary.GetInflectionType(wordId);
+ }
+
+ /// <summary>
+ /// inflection form or <c>null</c>
+ /// </summary>
+ /// <returns>inflection form or <c>null</c></returns>
+ public virtual string GetInflectionForm()
+ {
+ return dictionary.GetInflectionForm(wordId);
+ }
+
+ /// <summary>
+ /// base form or <c>null</c> if token is not inflected
+ /// </summary>
+ /// <returns>base form or <c>null</c> if token is not inflected</returns>
+ public virtual string GetBaseForm()
+ {
+ return dictionary.GetBaseForm(wordId, surfaceForm, offset, length);
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> if this token is known word.
+ /// </summary>
+ /// <returns><c>true</c> if this token is in standard dictionary. <c>false</c> if not.</returns>
+ public virtual bool IsKnown()
+ {
+ return type == JapaneseTokenizerType.KNOWN;
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> if this token is unknown word.
+ /// </summary>
+ /// <returns><c>true</c> if this token is unknown word. <c>false</c> if not.</returns>
+ public virtual bool IsUnknown()
+ {
+ return type == JapaneseTokenizerType.UNKNOWN;
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> if this token is defined in user dictionary.
+ /// </summary>
+ /// <returns><c>true</c> if this token is in user dictionary. <c>false</c> if not.</returns>
+ public virtual bool IsUser()
+ {
+ return type == JapaneseTokenizerType.USER;
+ }
+
+ /// <summary>
+ /// Get index of this token in input text. Returns position of token.
+ /// </summary>
+ public virtual int Position
+ {
+ get { return position; }
+ }
+
+ /// <summary>
+ /// Gets or Sets the length (in tokens) of this token. For normal
+ /// tokens this is 1; for compound tokens it's > 1.
+ /// <para/>
+ /// NOTE(review): the backing field defaults to 0 until the tokenizer
+ /// assigns it — confirm consumers treat an unset value as 1.
+ /// </summary>
+ public virtual int PositionLength
+ {
+ get { return positionLength; }
+ set { this.positionLength = value; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs
new file mode 100644
index 0000000..e3a06b3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs
@@ -0,0 +1,33 @@
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Attribute for <see cref="Token.GetBaseForm()"/>
+ /// <para/>
+ /// Note: depending on part of speech, this value may not be applicable,
+ /// and will be null.
+ /// </summary>
+ public interface IBaseFormAttribute : IAttribute
+ {
+ /// <summary>Gets the base form of the current token, or <c>null</c> if not applicable.</summary>
+ string GetBaseForm();
+ /// <summary>Sets the <see cref="Token"/> from which the base form is read.</summary>
+ void SetToken(Token token);
+ }
+}