You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:36 UTC

[11/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Ported Lucene.Net.Analysis.Kuromoji + tests


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/0f092010
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/0f092010
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/0f092010

Branch: refs/heads/master
Commit: 0f092010450cec325f541c5d6e404fd5f3b77a83
Parents: e67244a
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Sun Jul 23 19:39:51 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Mon Jul 24 00:35:27 2017 +0700

----------------------------------------------------------------------
 CONTRIBUTING.md                                 |    6 +-
 Lucene.Net.Portable.sln                         |   20 +
 Lucene.Net.sln                                  |   52 +
 README.md                                       |    9 +-
 .../Dict/BinaryDictionary.cs                    |  330 ++++
 .../Dict/CharacterDefinition.cs                 |  124 ++
 .../Dict/CharacterDefinition.dat                |  Bin 0 -> 65568 bytes
 .../Dict/ConnectionCosts.cs                     |   90 ++
 .../Dict/ConnectionCosts.dat                    |  Bin 0 -> 2624540 bytes
 .../Dict/Dictionary.cs                          |  106 ++
 .../Dict/TokenInfoDictionary$buffer.dat         |  Bin 0 -> 4337216 bytes
 .../Dict/TokenInfoDictionary$fst.dat            |  Bin 0 -> 1716198 bytes
 .../Dict/TokenInfoDictionary$posDict.dat        |  Bin 0 -> 54870 bytes
 .../Dict/TokenInfoDictionary$targetMap.dat      |  Bin 0 -> 392165 bytes
 .../Dict/TokenInfoDictionary.cs                 |   72 +
 .../Dict/TokenInfoFST.cs                        |  118 ++
 .../Dict/UnknownDictionary$buffer.dat           |  Bin 0 -> 311 bytes
 .../Dict/UnknownDictionary$posDict.dat          |  Bin 0 -> 4111 bytes
 .../Dict/UnknownDictionary$targetMap.dat        |  Bin 0 -> 69 bytes
 .../Dict/UnknownDictionary.cs                   |  100 ++
 .../Dict/UserDictionary.cs                      |  300 ++++
 .../GraphvizFormatter.cs                        |  197 +++
 .../JapaneseAnalyzer.cs                         |  103 ++
 .../JapaneseBaseFormFilter.cs                   |   65 +
 .../JapaneseBaseFormFilterFactory.cs            |   52 +
 .../JapaneseIterationMarkCharFilter.cs          |  500 ++++++
 .../JapaneseIterationMarkCharFilterFactory.cs   |   66 +
 .../JapaneseKatakanaStemFilter.cs               |  111 ++
 .../JapaneseKatakanaStemFilterFactory.cs        |   61 +
 .../JapanesePartOfSpeechStopFilter.cs           |   61 +
 .../JapanesePartOfSpeechStopFilterFactory.cs    |   85 +
 .../JapaneseReadingFormFilter.cs                |   89 ++
 .../JapaneseReadingFormFilterFactory.cs         |   57 +
 .../JapaneseTokenizer.cs                        | 1489 ++++++++++++++++++
 .../JapaneseTokenizerFactory.cs                 |  100 ++
 .../Lucene.Net.Analysis.Kuromoji.csproj         |  118 ++
 .../Lucene.Net.Analysis.Kuromoji.project.json   |    8 +
 .../Lucene.Net.Analysis.Kuromoji.xproj          |   38 +
 .../Properties/AssemblyInfo.cs                  |   46 +
 src/Lucene.Net.Analysis.Kuromoji/Token.cs       |  194 +++
 .../TokenAttributes/BaseFormAttribute.cs        |   33 +
 .../TokenAttributes/BaseFormAttributeImpl.cs    |   55 +
 .../TokenAttributes/InflectionAttribute.cs      |   34 +
 .../TokenAttributes/InflectionAttributeImpl.cs  |   68 +
 .../TokenAttributes/PartOfSpeechAttribute.cs    |   30 +
 .../PartOfSpeechAttributeImpl.cs                |   59 +
 .../TokenAttributes/ReadingAttribute.cs         |   34 +
 .../TokenAttributes/ReadingAttributeImpl.cs     |   68 +
 .../Tools/BinaryDictionaryWriter.cs             |  370 +++++
 .../Tools/CharacterDefinitionWriter.cs          |   91 ++
 .../Tools/ConnectionCostsBuilder.cs             |   68 +
 .../Tools/ConnectionCostsWriter.cs              |   74 +
 .../Tools/DictionaryBuilder.cs                  |   92 ++
 .../Tools/TokenInfoDictionaryBuilder.cs         |  230 +++
 .../Tools/TokenInfoDictionaryWriter.cs          |   51 +
 .../Tools/UnknownDictionaryBuilder.cs           |  146 ++
 .../Tools/UnknownDictionaryWriter.cs            |   66 +
 .../Util/CSVUtil.cs                             |  124 ++
 .../Util/ToStringUtil.cs                        | 1401 ++++++++++++++++
 src/Lucene.Net.Analysis.Kuromoji/project.json   |   60 +
 src/Lucene.Net.Analysis.Kuromoji/stoptags.txt   |  420 +++++
 src/Lucene.Net.Analysis.Kuromoji/stopwords.txt  |  127 ++
 .../Dict/TestTokenInfoDictionary.cs             |  114 ++
 .../Dict/UserDictionaryTest.cs                  |   90 ++
 .../Lucene.Net.Tests.Analysis.Kuromoji.csproj   |  106 ++
 ...ene.Net.Tests.Analysis.Kuromoji.project.json |   11 +
 .../Lucene.Net.Tests.Analysis.Kuromoji.xproj    |   41 +
 .../Properties/AssemblyInfo.cs                  |   38 +
 .../StringMockResourceLoader.cs                 |   67 +
 .../Support/TestApiConsistency.cs               |  150 ++
 .../Support/TestExceptionSerialization.cs       |   54 +
 .../TestExtendedMode.cs                         |   82 +
 .../TestJapaneseAnalyzer.cs                     |  229 +++
 .../TestJapaneseBaseFormFilter.cs               |   84 +
 .../TestJapaneseBaseFormFilterFactory.cs        |   60 +
 .../TestJapaneseIterationMarkCharFilter.cs      |  241 +++
 ...estJapaneseIterationMarkCharFilterFactory.cs |  108 ++
 .../TestJapaneseKatakanaStemFilter.cs           |  100 ++
 .../TestJapaneseKatakanaStemFilterFactory.cs    |   62 +
 ...TestJapanesePartOfSpeechStopFilterFactory.cs |   70 +
 .../TestJapaneseReadingFormFilter.cs            |  109 ++
 .../TestJapaneseReadingFormFilterFactory.cs     |   59 +
 .../TestJapaneseTokenizer.cs                    |  846 ++++++++++
 .../TestJapaneseTokenizerFactory.cs             |  134 ++
 .../TestSearchMode.cs                           |   92 ++
 .../Tools/UnknownDictionaryTest.cs              |   93 ++
 .../Util/TestToStringUtil.cs                    |  121 ++
 .../bocchan.utf-8                               |    1 +
 .../project.json                                |   43 +
 .../search-segmentation-tests.txt               |  142 ++
 .../userdict.txt                                |   10 +
 src/Lucene.Net/Support/Collections.cs           |    9 +
 92 files changed, 11827 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/CONTRIBUTING.md
----------------------------------------------------------------------
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 226e681..fa2942c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -36,11 +36,7 @@ helpers to help with that, see for examples see our [Java style methods to avoid
 
 Note that even though we are currently a port of Lucene 4.8.0, we recommend porting over new work from 4.8.1. We hope to begin the work of upgrading to 4.8.1 soon (let us know if interested). There are only about 100 files that changed between 4.8.0 and 4.8.1.
 
-### Pending being ported from scratch (code + tests)
-
-* [Lucene.Net.Analysis.Kuromoji](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.8.1/lucene/analysis/kuromoji) - See [JIRA issue 567](https://issues.apache.org/jira/browse/LUCENENET-567)
-
-### Pending being ported from scratch (code + tests), but have additional dependencies that also either need to be sourced from the .NET ecosystem or ported.
+### Pending being ported from scratch (code + tests) plus have additional dependencies that either need to be sourced from the .NET ecosystem or ported.
 
 * [Lucene.Net.Benchmark](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.8.1/lucene/benchmark) - See [JIRA issue 564](https://issues.apache.org/jira/browse/LUCENENET-564)
 * [Lucene.Net.Analysis.Morfologik](https://github.com/apache/lucene-solr/tree/releases/lucene-solr/4.8.1/lucene/analysis/morfologik) - See [JIRA issue 568](https://issues.apache.org/jira/browse/LUCENENET-568)

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/Lucene.Net.Portable.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln
index bac9168..e94a262 100644
--- a/Lucene.Net.Portable.sln
+++ b/Lucene.Net.Portable.sln
@@ -103,6 +103,10 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "lucene-cli", "src\tools\luc
 EndProject
 Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Cli", "src\tools\Lucene.Net.Tests.Cli\Lucene.Net.Tests.Cli.xproj", "{495B65F0-0B01-40FE-9DC8-5A82C49E07EF}"
 EndProject
+Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Analysis.Kuromoji", "src\Lucene.Net.Analysis.Kuromoji\Lucene.Net.Analysis.Kuromoji.xproj", "{87E54CA7-7394-4705-A99A-0DD638265C56}"
+EndProject
+Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Analysis.Kuromoji", "src\Lucene.Net.Tests.Analysis.Kuromoji\Lucene.Net.Tests.Analysis.Kuromoji.xproj", "{F82F0F31-09E7-48FB-B5FF-F3A84627A307}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -471,6 +475,22 @@ Global
 		{495B65F0-0B01-40FE-9DC8-5A82C49E07EF}.Release|Any CPU.Build.0 = Release|Any CPU
 		{495B65F0-0B01-40FE-9DC8-5A82C49E07EF}.Release|x86.ActiveCfg = Release|Any CPU
 		{495B65F0-0B01-40FE-9DC8-5A82C49E07EF}.Release|x86.Build.0 = Release|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Debug|x86.Build.0 = Debug|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Release|Any CPU.Build.0 = Release|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Release|x86.ActiveCfg = Release|Any CPU
+		{87E54CA7-7394-4705-A99A-0DD638265C56}.Release|x86.Build.0 = Release|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Debug|x86.Build.0 = Debug|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Release|Any CPU.Build.0 = Release|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Release|x86.ActiveCfg = Release|Any CPU
+		{F82F0F31-09E7-48FB-B5FF-F3A84627A307}.Release|x86.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/Lucene.Net.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.sln b/Lucene.Net.sln
index a187ccc..5450020 100644
--- a/Lucene.Net.sln
+++ b/Lucene.Net.sln
@@ -106,6 +106,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Demo", "src\Luce
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Demo", "src\Lucene.Net.Tests.Demo\Lucene.Net.Tests.Demo.csproj", "{571B361E-B0D4-445E-A0BC-1A24AA184258}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.Kuromoji", "src\Lucene.Net.Analysis.Kuromoji\Lucene.Net.Analysis.Kuromoji.csproj", "{8408625A-2508-46D5-8519-045183C43724}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.Kuromoji", "src\Lucene.Net.Tests.Analysis.Kuromoji\Lucene.Net.Tests.Analysis.Kuromoji.csproj", "{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -1059,6 +1063,54 @@ Global
 		{571B361E-B0D4-445E-A0BC-1A24AA184258}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
 		{571B361E-B0D4-445E-A0BC-1A24AA184258}.Release35|x86.ActiveCfg = Release|Any CPU
 		{571B361E-B0D4-445E-A0BC-1A24AA184258}.Release35|x86.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug|x86.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|x86.ActiveCfg = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Debug35|x86.Build.0 = Debug|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|Any CPU.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|x86.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release|x86.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|Any CPU.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|x86.ActiveCfg = Release|Any CPU
+		{8408625A-2508-46D5-8519-045183C43724}.Release35|x86.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug|x86.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|x86.ActiveCfg = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Debug35|x86.Build.0 = Debug|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|Any CPU.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|x86.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release|x86.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Any CPU.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.ActiveCfg = Release|Any CPU
+		{34A2BCE8-1351-43BD-A365-F50E7C0B2C49}.Release35|x86.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index ee56fce..3d0df77 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,13 @@ PM> Install-Package Lucene.Net -Pre
 ```
 
 As of 4.8.0, Lucene.Net is now divided into several specialized sub-packages, all available on NuGet.
-<!--- TO BE ADDED WHEN RELEASED - [Lucene.Net.Analysis.Phonetic](https://www.nuget.org/packages/Lucene.Net.Analysis.Phonetic/) - Analyzer for indexing phonetic signatures (for sounds-alike search) ) 
+
+<!--- TO BE ADDED WHEN RELEASED 
+- [Lucene.Net.Analysis.Kuromoji](https://www.nuget.org/packages/Lucene.Net.Analysis.Kuromoji/) - Japanese Morphological Analyzer 
+- [Lucene.Net.Analysis.Phonetic](https://www.nuget.org/packages/Lucene.Net.Analysis.Phonetic/) - Analyzer for indexing phonetic signatures (for sounds-alike search)
 - [Lucene.Net.Analysis.SmartCn](https://www.nuget.org/packages/Lucene.Net.Analysis.SmartCn/) - Analyzer for indexing Chinese)-->
+
+
 - [Lucene.Net](https://www.nuget.org/packages/Lucene.Net/) - Core library
 - [Lucene.Net.Analysis.Common](https://www.nuget.org/packages/Lucene.Net.Analysis.Common/) - Analyzers for indexing content in different languages and domains
 - [Lucene.Net.Analysis.Stempel](https://www.nuget.org/packages/Lucene.Net.Analysis.Stempel/) - Analyzer for indexing Polish
@@ -52,7 +57,7 @@ As of 4.8.0, Lucene.Net is now divided into several specialized sub-packages, al
 - [Lucene.Net.Facet](https://www.nuget.org/packages/Lucene.Net.Facet/) - Faceted indexing and search capabilities
 - [Lucene.Net.Grouping](https://www.nuget.org/packages/Lucene.Net.Grouping/) - Collectors for grouping search results
 - [Lucene.Net.Highlighter](https://www.nuget.org/packages/Lucene.Net.Highlighter/) - Highlights search keywords in results
-- [Lucene.Net.ICU](https://www.nuget.org/packages/Lucene.Net.ICU/) - Specialized international support for languages that don't space words
+- [Lucene.Net.ICU](https://www.nuget.org/packages/Lucene.Net.ICU/) - Specialized ICU (International Components for Unicode) Analyzers and Highlighters
 - [Lucene.Net.Join](https://www.nuget.org/packages/Lucene.Net.Join/) - Index-time and Query-time joins for normalized content
 - [Lucene.Net.Memory](https://www.nuget.org/packages/Lucene.Net.Memory/) - Single-document in-memory index implementation
 - [Lucene.Net.Misc](https://www.nuget.org/packages/Lucene.Net.Misc/) - Index tools and other miscellaneous code

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/BinaryDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/BinaryDictionary.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/BinaryDictionary.cs
new file mode 100644
index 0000000..4068e38
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/BinaryDictionary.cs
@@ -0,0 +1,330 @@
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Support.IO;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+using System.Reflection;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Base class for a binary-encoded in-memory dictionary.
+    /// </summary>
+    public abstract class BinaryDictionary : IDictionary
+    {
+        public static readonly string DICT_FILENAME_SUFFIX = "$buffer.dat";
+        public static readonly string TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+        public static readonly string POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+
+        public static readonly string DICT_HEADER = "kuromoji_dict";
+        public static readonly string TARGETMAP_HEADER = "kuromoji_dict_map";
+        public static readonly string POSDICT_HEADER = "kuromoji_dict_pos";
+        public static readonly int VERSION = 1;
+
+        private readonly ByteBuffer buffer;
+        private readonly int[] targetMapOffsets, targetMap;
+        private readonly string[] posDict;
+        private readonly string[] inflTypeDict;
+        private readonly string[] inflFormDict;
+
+        protected BinaryDictionary()
+        {
+            int[] targetMapOffsets = null, targetMap = null;
+            string[] posDict = null;
+            string[] inflFormDict = null;
+            string[] inflTypeDict = null;
+            ByteBuffer buffer = null;
+
+            using (Stream mapIS = GetResource(TARGETMAP_FILENAME_SUFFIX))
+            {
+                DataInput @in = new InputStreamDataInput(mapIS);
+                CodecUtil.CheckHeader(@in, TARGETMAP_HEADER, VERSION, VERSION);
+                targetMap = new int[@in.ReadVInt32()];
+                targetMapOffsets = new int[@in.ReadVInt32()];
+                int accum = 0, sourceId = 0;
+                for (int ofs = 0; ofs < targetMap.Length; ofs++)
+                {
+                    int val = @in.ReadVInt32();
+                    if ((val & 0x01) != 0)
+                    {
+                        targetMapOffsets[sourceId] = ofs;
+                        sourceId++;
+                    }
+                    accum += (int)((uint)val) >> 1;
+                    targetMap[ofs] = accum;
+                }
+                if (sourceId + 1 != targetMapOffsets.Length)
+                    throw new IOException("targetMap file format broken");
+                targetMapOffsets[sourceId] = targetMap.Length;
+            }
+
+            using (Stream posIS = GetResource(POSDICT_FILENAME_SUFFIX))
+            {
+                DataInput @in = new InputStreamDataInput(posIS);
+                CodecUtil.CheckHeader(@in, POSDICT_HEADER, VERSION, VERSION);
+                int posSize = @in.ReadVInt32();
+                posDict = new string[posSize];
+                inflTypeDict = new string[posSize];
+                inflFormDict = new string[posSize];
+                for (int j = 0; j < posSize; j++)
+                {
+                    posDict[j] = @in.ReadString();
+                    inflTypeDict[j] = @in.ReadString();
+                    inflFormDict[j] = @in.ReadString();
+                    // this is how we encode null inflections
+                    if (inflTypeDict[j].Length == 0)
+                    {
+                        inflTypeDict[j] = null;
+                    }
+                    if (inflFormDict[j].Length == 0)
+                    {
+                        inflFormDict[j] = null;
+                    }
+                }
+            }
+
+            ByteBuffer tmpBuffer;
+
+            using (Stream dictIS = GetResource(DICT_FILENAME_SUFFIX))
+            {
+                // no buffering here, as we load in one large buffer
+                DataInput @in = new InputStreamDataInput(dictIS);
+                CodecUtil.CheckHeader(@in, DICT_HEADER, VERSION, VERSION);
+                int size = @in.ReadVInt32();
+                tmpBuffer = ByteBuffer.Allocate(size); // AllocateDirect..?
+                int read = dictIS.Read(tmpBuffer.Array, 0, size);
+                if (read != size)
+                {
+                    throw new EndOfStreamException("Cannot read whole dictionary");
+                }
+            }
+            buffer = tmpBuffer.AsReadOnlyBuffer();
+
+            this.targetMap = targetMap;
+            this.targetMapOffsets = targetMapOffsets;
+            this.posDict = posDict;
+            this.inflTypeDict = inflTypeDict;
+            this.inflFormDict = inflFormDict;
+            this.buffer = buffer;
+        }
+
+        protected Stream GetResource(string suffix)
+        {
+            return GetTypeResource(GetType(), suffix);
+        }
+
+        // util, reused by ConnectionCosts and CharacterDefinition
+        public static Stream GetTypeResource(Type clazz, string suffix)
+        {
+            Stream @is = clazz.GetTypeInfo().Assembly.FindAndGetManifestResourceStream(clazz, clazz.Name + suffix);
+            if (@is == null)
+                throw new FileNotFoundException("Not in assembly: " + clazz.FullName + suffix);
+            return @is;
+        }
+
+        public virtual void LookupWordIds(int sourceId, Int32sRef @ref)
+        {
+            @ref.Int32s = targetMap;
+            @ref.Offset = targetMapOffsets[sourceId];
+            // targetMapOffsets always has one more entry pointing behind last:
+            @ref.Length = targetMapOffsets[sourceId + 1] - @ref.Offset;
+        }
+
+        public virtual int GetLeftId(int wordId)
+        {
+            return (short)((ushort)buffer.GetInt16(wordId)) >> 3;
+        }
+
+        public virtual int GetRightId(int wordId)
+        {
+            return (short)((ushort)buffer.GetInt16(wordId)) >> 3;
+        }
+
+        public virtual int GetWordCost(int wordId)
+        {
+            return buffer.GetInt16(wordId + 2);  // Skip id
+        }
+
+        public virtual string GetBaseForm(int wordId, char[] surfaceForm, int off, int len)
+        {
+            if (HasBaseFormData(wordId))
+            {
+                int offset = BaseFormOffset(wordId);
+                int data = buffer.Get(offset++) & 0xff;
+                int prefix = (int)((uint)data) >> 4;
+                int suffix = data & 0xF;
+                char[] text = new char[prefix + suffix];
+                System.Array.Copy(surfaceForm, off, text, 0, prefix);
+                for (int i = 0; i < suffix; i++)
+                {
+                    text[prefix + i] = buffer.GetChar(offset + (i << 1));
+                }
+                return new string(text);
+            }
+            else
+            {
+                return null;
+            }
+        }
+
+        public virtual string GetReading(int wordId, char[] surface, int off, int len)
+        {
+            if (HasReadingData(wordId))
+            {
+                int offset = ReadingOffset(wordId);
+                int readingData = buffer.Get(offset++) & 0xff;
+                return ReadString(offset, (int)((uint)readingData) >> 1, (readingData & 1) == 1);
+            }
+            else
+            {
+                // the reading is the surface form, with hiragana shifted to katakana
+                char[] text = new char[len];
+                for (int i = 0; i < len; i++)
+                {
+                    char ch = surface[off + i];
+                    if (ch > 0x3040 && ch < 0x3097)
+                    {
+                        text[i] = (char)(ch + 0x60);
+                    }
+                    else
+                    {
+                        text[i] = ch;
+                    }
+                }
+                return new string(text);
+            }
+        }
+
+        public virtual string GetPartOfSpeech(int wordId)
+        {
+            return posDict[GetLeftId(wordId)];
+        }
+
+        public virtual string GetPronunciation(int wordId, char[] surface, int off, int len)
+        {
+            if (HasPronunciationData(wordId))
+            {
+                int offset = PronunciationOffset(wordId);
+                int pronunciationData = buffer.Get(offset++) & 0xff;
+                return ReadString(offset, (int)((uint)pronunciationData) >> 1, (pronunciationData & 1) == 1);
+            }
+            else
+            {
+                return GetReading(wordId, surface, off, len); // same as the reading
+            }
+        }
+
+        public virtual string GetInflectionType(int wordId)
+        {
+            return inflTypeDict[GetLeftId(wordId)];
+        }
+
+        public virtual string GetInflectionForm(int wordId)
+        {
+            return inflFormDict[GetLeftId(wordId)];
+        }
+
+        private static int BaseFormOffset(int wordId)
+        {
+            return wordId + 4;
+        }
+
+        private int ReadingOffset(int wordId)
+        {
+            int offset = BaseFormOffset(wordId);
+            if (HasBaseFormData(wordId))
+            {
+                int baseFormLength = buffer.Get(offset++) & 0xf;
+                return offset + (baseFormLength << 1);
+            }
+            else
+            {
+                return offset;
+            }
+        }
+
+        private int PronunciationOffset(int wordId)
+        {
+            if (HasReadingData(wordId))
+            {
+                int offset = ReadingOffset(wordId);
+                int readingData = buffer.Get(offset++) & 0xff;
+                int readingLength;
+                if ((readingData & 1) == 0)
+                {
+                    readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+                }
+                else
+                {
+                    readingLength = (int)((uint)readingData) >> 1;
+                }
+                return offset + readingLength;
+            }
+            else
+            {
+                return ReadingOffset(wordId);
+            }
+        }
+
+        private bool HasBaseFormData(int wordId)
+        {
+            return (buffer.GetInt16(wordId) & HAS_BASEFORM) != 0;
+        }
+
+        private bool HasReadingData(int wordId)
+        {
+            return (buffer.GetInt16(wordId) & HAS_READING) != 0;
+        }
+
+        private bool HasPronunciationData(int wordId)
+        {
+            return (buffer.GetInt16(wordId) & HAS_PRONUNCIATION) != 0;
+        }
+
+        private string ReadString(int offset, int length, bool kana)
+        {
+            char[] text = new char[length];
+            if (kana)
+            {
+                for (int i = 0; i < length; i++)
+                {
+                    text[i] = (char)(0x30A0 + (buffer.Get(offset + i) & 0xff));
+                }
+            }
+            else
+            {
+                for (int i = 0; i < length; i++)
+                {
+                    text[i] = buffer.GetChar(offset + (i << 1));
+                }
+            }
+            return new string(text);
+        }
+
+        /// <summary>flag that the entry has baseform data. otherwise its not inflected (same as surface form)</summary>
+        public static readonly int HAS_BASEFORM = 1;
+        /// <summary>flag that the entry has reading data. otherwise reading is surface form converted to katakana</summary>
+        public static readonly int HAS_READING = 2;
+        /// <summary>flag that the entry has pronunciation data. otherwise pronunciation is the reading</summary>
+        public static readonly int HAS_PRONUNCIATION = 4;
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.cs
new file mode 100644
index 0000000..2821941
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.cs
@@ -0,0 +1,124 @@
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Character category data.
+    /// </summary>
+    public sealed class CharacterDefinition
+    {
+        public static readonly string FILENAME_SUFFIX = ".dat";
+        public static readonly string HEADER = "kuromoji_cd";
+        public static readonly int VERSION = 1;
+
+        public static readonly int CLASS_COUNT = Enum.GetValues(typeof(CharacterClass)).Length;
+
+        // only used internally for lookup:
+        private enum CharacterClass : byte
+        {
+            NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC
+        }
+
+        private readonly byte[] characterCategoryMap = new byte[0x10000];
+
+        private readonly bool[] invokeMap = new bool[CLASS_COUNT];
+        private readonly bool[] groupMap = new bool[CLASS_COUNT];
+
+        // the classes:
+        public static readonly byte NGRAM = (byte)CharacterClass.NGRAM;
+        public static readonly byte DEFAULT = (byte)CharacterClass.DEFAULT;
+        public static readonly byte SPACE = (byte)CharacterClass.SPACE;
+        public static readonly byte SYMBOL = (byte)CharacterClass.SYMBOL;
+        public static readonly byte NUMERIC = (byte)CharacterClass.NUMERIC;
+        public static readonly byte ALPHA = (byte)CharacterClass.ALPHA;
+        public static readonly byte CYRILLIC = (byte)CharacterClass.CYRILLIC;
+        public static readonly byte GREEK = (byte)CharacterClass.GREEK;
+        public static readonly byte HIRAGANA = (byte)CharacterClass.HIRAGANA;
+        public static readonly byte KATAKANA = (byte)CharacterClass.KATAKANA;
+        public static readonly byte KANJI = (byte)CharacterClass.KANJI;
+        public static readonly byte KANJINUMERIC = (byte)CharacterClass.KANJINUMERIC;
+
+        private CharacterDefinition()
+        {
+            using (Stream @is = BinaryDictionary.GetTypeResource(GetType(), FILENAME_SUFFIX))
+            {
+                DataInput @in = new InputStreamDataInput(@is);
+                CodecUtil.CheckHeader(@in, HEADER, VERSION, VERSION);
+                @in.ReadBytes(characterCategoryMap, 0, characterCategoryMap.Length);
+                for (int i = 0; i < CLASS_COUNT; i++)
+                {
+                    byte b = @in.ReadByte();
+                    invokeMap[i] = (b & 0x01) != 0;
+                    groupMap[i] = (b & 0x02) != 0;
+                }
+            }
+        }
+
+        public byte GetCharacterClass(char c)
+        {
+            return characterCategoryMap[c];
+        }
+
+        public bool IsInvoke(char c)
+        {
+            return invokeMap[characterCategoryMap[c]];
+        }
+
+        public bool IsGroup(char c)
+        {
+            return groupMap[characterCategoryMap[c]];
+        }
+
+        public bool IsKanji(char c)
+        {
+            byte characterClass = characterCategoryMap[c];
+            return characterClass == KANJI || characterClass == KANJINUMERIC;
+        }
+
+        public static byte LookupCharacterClass(string characterClassName)
+        {
+            return (byte)Enum.Parse(typeof(CharacterClass), characterClassName, true);
+        }
+
+        public static CharacterDefinition GetInstance()
+        {
+            return SingletonHolder.INSTANCE;
+        }
+
+        private class SingletonHolder
+        {
+            internal static readonly CharacterDefinition INSTANCE;
+            static SingletonHolder()
+            {
+                try
+                {
+                    INSTANCE = new CharacterDefinition();
+                }
+                catch (IOException ioe)
+                {
+                    throw new Exception("Cannot load CharacterDefinition.", ioe);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.dat
new file mode 100644
index 0000000..4b8bd4b
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/CharacterDefinition.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.cs
new file mode 100644
index 0000000..02d8eb5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.cs
@@ -0,0 +1,90 @@
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// n-gram connection cost data
+    /// </summary>
+    public sealed class ConnectionCosts
+    {
+        public static readonly string FILENAME_SUFFIX = ".dat";
+        public static readonly string HEADER = "kuromoji_cc";
+        public static readonly int VERSION = 1;
+
+        private readonly short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
+
+        private ConnectionCosts()
+        {
+            short[][] costs = null;
+
+            using (Stream @is = BinaryDictionary.GetTypeResource(GetType(), FILENAME_SUFFIX))
+            {
+                DataInput @in = new InputStreamDataInput(@is);
+                CodecUtil.CheckHeader(@in, HEADER, VERSION, VERSION);
+                int forwardSize = @in.ReadVInt32();
+                int backwardSize = @in.ReadVInt32();
+                costs = RectangularArrays.ReturnRectangularArray<short>(backwardSize, forwardSize);
+                int accum = 0;
+                for (int j = 0; j < costs.Length; j++)
+                {
+                    short[] a = costs[j];
+                    for (int i = 0; i < a.Length; i++)
+                    {
+                        int raw = @in.ReadVInt32();
+                        accum += ((int)((uint)raw) >> 1) ^ -(raw & 1);
+                        a[i] = (short)accum;
+                    }
+                }
+            }
+
+            this.costs = costs;
+        }
+
+        public int Get(int forwardId, int backwardId)
+        {
+            return costs[backwardId][forwardId];
+        }
+
+        public static ConnectionCosts GetInstance()
+        {
+            return SingletonHolder.INSTANCE;
+        }
+
+        private class SingletonHolder
+        {
+            internal static readonly ConnectionCosts INSTANCE;
+            static SingletonHolder()
+            {
+                try
+                {
+                    INSTANCE = new ConnectionCosts();
+                }
+                catch (IOException ioe)
+                {
+                    throw new Exception("Cannot load ConnectionCosts.", ioe);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.dat
new file mode 100644
index 0000000..7679f14
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/ConnectionCosts.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/Dictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/Dictionary.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/Dictionary.cs
new file mode 100644
index 0000000..4c24a4e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/Dictionary.cs
@@ -0,0 +1,106 @@
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Dictionary interface for retrieving morphological data
+    /// by id.
+    /// </summary>
+    public interface IDictionary
+    {
+        /// <summary>
+        /// Get left id of specified word.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Left id.</returns>
+        int GetLeftId(int wordId);
+
+        /// <summary>
+        /// Get right id of specified word.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Right id.</returns>
+        int GetRightId(int wordId);
+
+        /// <summary>
+        /// Get word cost of specified word
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Word's cost.</returns>
+        int GetWordCost(int wordId);
+
+        /// <summary>
+        /// Get Part-Of-Speech of tokens
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Part-Of-Speech of the token.</returns>
+        string GetPartOfSpeech(int wordId);
+
+        /// <summary>
+        /// Get reading of tokens.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <param name="surface"></param>
+        /// <param name="off"></param>
+        /// <param name="len"></param>
+        /// <returns>Reading of the token.</returns>
+        string GetReading(int wordId, char[] surface, int off, int len);
+
+        /// <summary>
+        /// Get base form of word.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <param name="surface"></param>
+        /// <param name="off"></param>
+        /// <param name="len"></param>
+        /// <returns>Base form (only different for inflected words, otherwise null).</returns>
+        string GetBaseForm(int wordId, char[] surface, int off, int len);
+
+        /// <summary>
+        /// Get pronunciation of tokens
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <param name="surface"></param>
+        /// <param name="off"></param>
+        /// <param name="len"></param>
+        /// <returns>Pronunciation of the token.</returns>
+        string GetPronunciation(int wordId, char[] surface, int off, int len);
+
+        /// <summary>
+        /// Get inflection type of tokens.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Inflection type, or null.</returns>
+        string GetInflectionType(int wordId);
+
+        /// <summary>
+        /// Get inflection form of tokens.
+        /// </summary>
+        /// <param name="wordId">Word ID of token.</param>
+        /// <returns>Inflection form, or null.</returns>
+        string GetInflectionForm(int wordId);
+        // TODO: maybe we should have a optimal method, a non-typesafe
+        // 'getAdditionalData' if other dictionaries like unidic have additional data
+    }
+
+    // LUCENENET TODO: Make this whole thing into an abstract class??
+    public class Dictionary
+    {
+        public static readonly string INTERNAL_SEPARATOR = "\u0000";
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$buffer.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$buffer.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$buffer.dat
new file mode 100644
index 0000000..dcf430a
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$buffer.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$fst.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$fst.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$fst.dat
new file mode 100644
index 0000000..ea5c43c
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$fst.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$posDict.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$posDict.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$posDict.dat
new file mode 100644
index 0000000..e727d90
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$posDict.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$targetMap.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$targetMap.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$targetMap.dat
new file mode 100644
index 0000000..0e27345
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary$targetMap.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary.cs
new file mode 100644
index 0000000..d46312b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoDictionary.cs
@@ -0,0 +1,72 @@
+using Lucene.Net.Store;
+using Lucene.Net.Util.Fst;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Binary dictionary implementation for a known-word dictionary model:
+    /// Words are encoded into an FST mapping to a list of wordIDs.
+    /// </summary>
+    public sealed class TokenInfoDictionary : BinaryDictionary
+    {
+        public static readonly string FST_FILENAME_SUFFIX = "$fst.dat";
+
+        private readonly TokenInfoFST fst;
+
+        private TokenInfoDictionary()
+        {
+            FST<long?> fst = null;
+            using (Stream @is = GetResource(FST_FILENAME_SUFFIX))
+            {
+                fst = new FST<long?>(new InputStreamDataInput(@is), PositiveInt32Outputs.Singleton);
+            }
+            // TODO: some way to configure?
+            this.fst = new TokenInfoFST(fst, true);
+        }
+
+        public TokenInfoFST FST
+        {
+            get { return fst; }
+        }
+
+        public static TokenInfoDictionary GetInstance()
+        {
+            return SingletonHolder.INSTANCE;
+        }
+
+        private class SingletonHolder
+        {
+            internal static readonly TokenInfoDictionary INSTANCE;
+            static SingletonHolder()
+            {
+                try
+                {
+                    INSTANCE = new TokenInfoDictionary();
+                }
+                catch (IOException ioe)
+                {
+                    throw new Exception("Cannot load TokenInfoDictionary.", ioe);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoFST.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoFST.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoFST.cs
new file mode 100644
index 0000000..dffdfbb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/TokenInfoFST.cs
@@ -0,0 +1,118 @@
+using Lucene.Net.Util.Fst;
+using System.Diagnostics;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Thin wrapper around an FST with root-arc caching for Japanese.
+    /// <para/>
+    /// Depending upon fasterButMoreRam, either just kana (191 arcs),
+    /// or kana and han (28,607 arcs) are cached. The latter offers
+    /// additional performance at the cost of more RAM.
+    /// </summary>
+    public sealed class TokenInfoFST
+    {
+        private readonly FST<long?> fst;
+
+        // depending upon fasterButMoreRam, we cache root arcs for either 
+        // kana (0x3040-0x30FF) or kana + han (0x3040-0x9FFF)
+        // false: 191 arcs
+        // true:  28,607 arcs (costs ~1.5MB)
+        private readonly int cacheCeiling;
+        private readonly FST.Arc<long?>[] rootCache;
+
+        private readonly long? NO_OUTPUT;
+
+        // LUCENENET specific - made field private
+        // and added public property for reading it.
+        public long? NoOutput
+        {
+            get { return NO_OUTPUT; }
+        }
+
+        public TokenInfoFST(FST<long?> fst, bool fasterButMoreRam)
+        {
+            this.fst = fst;
+            this.cacheCeiling = fasterButMoreRam ? 0x9FFF : 0x30FF;
+            NO_OUTPUT = fst.Outputs.NoOutput;
+            rootCache = CacheRootArcs();
+        }
+
+        private FST.Arc<long?>[] CacheRootArcs()
+        {
+            FST.Arc<long?>[] rootCache = new FST.Arc<long?>[1 + (cacheCeiling - 0x3040)];
+            FST.Arc<long?> firstArc = new FST.Arc<long?>();
+            fst.GetFirstArc(firstArc);
+            FST.Arc<long?> arc = new FST.Arc<long?>();
+            FST.BytesReader fstReader = fst.GetBytesReader();
+            // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
+            for (int i = 0; i < rootCache.Length; i++)
+            {
+                if (fst.FindTargetArc(0x3040 + i, firstArc, arc, fstReader) != null)
+                {
+                    rootCache[i] = new FST.Arc<long?>().CopyFrom(arc);
+                }
+            }
+            return rootCache;
+        }
+
+        public FST.Arc<long?> FindTargetArc(int ch, FST.Arc<long?> follow, FST.Arc<long?> arc, bool useCache, FST.BytesReader fstReader)
+        {
+            if (useCache && ch >= 0x3040 && ch <= cacheCeiling)
+            {
+                Debug.Assert(ch != FST.END_LABEL);
+                FST.Arc<long?> result = rootCache[ch - 0x3040];
+                if (result == null)
+                {
+                    return null;
+                }
+                else
+                {
+                    arc.CopyFrom(result);
+                    return arc;
+                }
+            }
+            else
+            {
+                return fst.FindTargetArc(ch, follow, arc, fstReader);
+            }
+        }
+
+        public FST.Arc<long?> GetFirstArc(FST.Arc<long?> arc)
+        {
+            return fst.GetFirstArc(arc);
+        }
+
+        public FST.BytesReader GetBytesReader()
+        {
+            return fst.GetBytesReader();
+        }
+
+        /// <summary>
+        /// for testing only
+        /// <para/>
+        /// @lucene.internal 
+        /// </summary>
+        internal FST<long?> InternalFST
+        {
+            get { return fst; }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$buffer.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$buffer.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$buffer.dat
new file mode 100644
index 0000000..16f0a82
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$buffer.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$posDict.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$posDict.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$posDict.dat
new file mode 100644
index 0000000..e709dcc
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$posDict.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$targetMap.dat
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$targetMap.dat b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$targetMap.dat
new file mode 100644
index 0000000..e8db0b3
Binary files /dev/null and b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary$targetMap.dat differ

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary.cs
new file mode 100644
index 0000000..364576b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/UnknownDictionary.cs
@@ -0,0 +1,100 @@
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Dictionary for unknown-word handling.
+    /// </summary>
+    public class UnknownDictionary : BinaryDictionary
+    {
+        private readonly CharacterDefinition characterDefinition = CharacterDefinition.GetInstance();
+
+        private UnknownDictionary()
+        {
+        }
+
+        public virtual int Lookup(char[] text, int offset, int len)
+        {
+            if (!characterDefinition.IsGroup(text[offset]))
+            {
+                return 1;
+            }
+
+            // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+            byte characterIdOfFirstCharacter = characterDefinition.GetCharacterClass(text[offset]);
+            int length = 1;
+            for (int i = 1; i < len; i++)
+            {
+                if (characterIdOfFirstCharacter == characterDefinition.GetCharacterClass(text[offset + i]))
+                {
+                    length++;
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            return length;
+        }
+
+        public virtual CharacterDefinition CharacterDefinition
+        {
+            get { return characterDefinition; }
+        }
+
+        public override string GetReading(int wordId, char[] surface, int off, int len)
+        {
+            return null;
+        }
+
+        public override string GetInflectionType(int wordId)
+        {
+            return null;
+        }
+
+        public override string GetInflectionForm(int wordId)
+        {
+            return null;
+        }
+
+        public static UnknownDictionary GetInstance()
+        {
+            return SingletonHolder.INSTANCE;
+        }
+
+        private class SingletonHolder
+        {
+            internal static readonly UnknownDictionary INSTANCE;
+            static SingletonHolder()
+            {
+                try
+                {
+                    INSTANCE = new UnknownDictionary();
+                }
+                catch (IOException ioe)
+                {
+                    throw new Exception("Cannot load UnknownDictionary.", ioe);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Dict/UserDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Dict/UserDictionary.cs b/src/Lucene.Net.Analysis.Kuromoji/Dict/UserDictionary.cs
new file mode 100644
index 0000000..3fb2b09
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Dict/UserDictionary.cs
@@ -0,0 +1,300 @@
+using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja.Dict
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Class for building a User Dictionary.
+    /// This class allows for custom segmentation of phrases.
+    /// <para/>
+    /// Entries are CSV lines of the form: text,segmentation,readings,part-of-speech.
+    /// </summary>
+    public sealed class UserDictionary : IDictionary
+    {
+        // phrase text -> phrase ID
+        private readonly TokenInfoFST fst;
+
+        // holds wordid, length, length... indexed by phrase ID
+        private readonly int[][] segmentations;
+
+        // holds readings and POS, indexed by wordid
+        private readonly string[] data;
+
+        private static readonly int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
+
+        public static readonly int WORD_COST = -100000;
+
+        public static readonly int LEFT_ID = 5;
+
+        public static readonly int RIGHT_ID = 5;
+
+        // Strips a trailing "#..." comment from a line.
+        // LUCENENET: renamed from "specialChars"; the old name did not describe its use.
+        private static readonly Regex comments = new Regex(@"#.*$", RegexOptions.Compiled);
+
+        // Matches one-or-more consecutive spaces; used to normalize the
+        // segmentation/reading columns before splitting.
+        // LUCENENET: renamed from "commentLine", which was misleading.
+        private static readonly Regex spaces = new Regex(@"  *", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Builds a user dictionary from CSV entries read from <paramref name="reader"/>.
+        /// </summary>
+        /// <param name="reader">Source of dictionary entries, one per line.</param>
+        public UserDictionary(TextReader reader)
+        {
+            string line = null;
+            int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
+            List<string[]> featureEntries = new List<string[]>();
+
+            // text, segmentation, readings, POS
+            while ((line = reader.ReadLine()) != null)
+            {
+                // Remove comments
+                line = comments.Replace(line, "");
+
+                // Skip empty lines or comment lines
+                if (line.Trim().Length == 0)
+                {
+                    continue;
+                }
+                string[] values = CSVUtil.Parse(line);
+                featureEntries.Add(values);
+            }
+
+            // TODO: should we allow multiple segmentations per input 'phrase'?
+            // the old treemap didn't support this either, and i'm not sure if its needed/useful?
+            featureEntries.Sort(new ComparerAnonymousHelper());
+
+            List<string> data = new List<string>(featureEntries.Count);
+            List<int[]> segmentations = new List<int[]>(featureEntries.Count);
+
+            PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
+            Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
+            Int32sRef scratch = new Int32sRef();
+            long ord = 0;
+
+            foreach (string[] values in featureEntries)
+            {
+                string[] segmentation = spaces.Replace(values[1], " ").Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+                string[] readings = spaces.Replace(values[2], " ").Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+                string pos = values[3];
+
+                if (segmentation.Length != readings.Length)
+                {
+                    // LUCENENET: fixed word order in the message
+                    // ("does not the match number" -> "does not match the number")
+                    throw new Exception("Illegal user dictionary entry " + values[0] +
+                                               " - the number of segmentations (" + segmentation.Length + ")" +
+                                               " does not match the number of readings (" + readings.Length + ")");
+                }
+
+                int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
+                wordIdAndLength[0] = wordId;
+                for (int i = 0; i < segmentation.Length; i++)
+                {
+                    wordIdAndLength[i + 1] = segmentation[i].Length;
+                    data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
+                    wordId++;
+                }
+                // add mapping to FST
+                string token = values[0];
+                scratch.Grow(token.Length);
+                scratch.Length = token.Length;
+                for (int i = 0; i < token.Length; i++)
+                {
+                    scratch.Int32s[i] = (int)token[i];
+                }
+                fstBuilder.Add(scratch, ord);
+                segmentations.Add(wordIdAndLength);
+                ord++;
+            }
+            this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
+            this.data = data.ToArray(/*new string[data.Count]*/);
+            this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
+        }
+
+        // LUCENENET TODO: Make an AnonymousComparer class in Support and
+        // replace all of these classes.
+        private class ComparerAnonymousHelper : IComparer<string[]>
+        {
+            // Orders entries by their surface text (ordinal comparison).
+            public int Compare(string[] left, string[] right)
+            {
+                return left[0].CompareToOrdinal(right[0]);
+            }
+        }
+
+        /// <summary>
+        /// Lookup words in text.
+        /// </summary>
+        /// <param name="chars">Text.</param>
+        /// <param name="off">Offset into text.</param>
+        /// <param name="len">Length of text.</param>
+        /// <returns>Array of {wordId, position, length}.</returns>
+        public int[][] Lookup(char[] chars, int off, int len)
+        {
+            // TODO: can we avoid this treemap/toIndexArray?
+            TreeDictionary<int, int[]> result = new TreeDictionary<int, int[]>(); // index, [length, length...]
+            bool found = false; // true if we found any results
+
+            FST.BytesReader fstReader = fst.GetBytesReader();
+
+            FST.Arc<long?> arc = new FST.Arc<long?>();
+            int end = off + len;
+            // Try to start a match at every position in [off, end):
+            for (int startOffset = off; startOffset < end; startOffset++)
+            {
+                arc = fst.GetFirstArc(arc);
+                int output = 0;
+                int remaining = end - startOffset;
+                for (int i = 0; i < remaining; i++)
+                {
+                    int ch = chars[startOffset + i];
+                    if (fst.FindTargetArc(ch, arc, arc, i == 0, fstReader) == null)
+                    {
+                        break; // continue to next position
+                    }
+                    output += (int)arc.Output;
+                    if (arc.IsFinal)
+                    {
+                        int finalOutput = output + (int)arc.NextFinalOutput;
+                        result[startOffset - off] = segmentations[finalOutput];
+                        found = true;
+                    }
+                }
+            }
+
+            return found ? ToIndexArray(result) : EMPTY_RESULT;
+        }
+
+        /// <summary>
+        /// Gets the internal FST mapping phrase text to phrase ID.
+        /// </summary>
+        public TokenInfoFST FST
+        {
+            get { return fst; }
+        }
+
+        private static readonly int[][] EMPTY_RESULT = new int[0][];
+
+        /// <summary>
+        /// Convert Map of index and wordIdAndLength to array of {wordId, index, length}
+        /// </summary>
+        /// <param name="input">Map of start index to {wordId offset, length, length...}.</param>
+        /// <returns>Array of {wordId, index, length}.</returns>
+        private int[][] ToIndexArray(TreeDictionary<int, int[]> input)
+        {
+            List<int[]> result = new List<int[]>();
+            foreach (int i in input.Keys)
+            {
+                int[] wordIdAndLength = input[i];
+                int wordId = wordIdAndLength[0];
+                // convert length to index
+                int current = i;
+                for (int j = 1; j < wordIdAndLength.Length; j++)
+                { // first entry is wordId offset
+                    int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
+                    result.Add(token);
+                    current += wordIdAndLength[j];
+                }
+            }
+            return result.ToArray(/*new int[result.size()][]*/);
+        }
+
+        /// <summary>
+        /// Returns the {wordId offset, length, length...} segmentation for the given phrase ID.
+        /// </summary>
+        public int[] LookupSegmentation(int phraseID)
+        {
+            return segmentations[phraseID];
+        }
+
+        /// <summary>All user dictionary entries share the fixed left id <see cref="LEFT_ID"/>.</summary>
+        public int GetLeftId(int wordId)
+        {
+            return LEFT_ID;
+        }
+
+        /// <summary>All user dictionary entries share the fixed right id <see cref="RIGHT_ID"/>.</summary>
+        public int GetRightId(int wordId)
+        {
+            return RIGHT_ID;
+        }
+
+        /// <summary>All user dictionary entries share the fixed cost <see cref="WORD_COST"/>.</summary>
+        public int GetWordCost(int wordId)
+        {
+            return WORD_COST;
+        }
+
+        /// <summary>Returns the reading (feature 0) for the given word id.</summary>
+        public string GetReading(int wordId, char[] surface, int off, int len)
+        {
+            return GetFeature(wordId, 0);
+        }
+
+        /// <summary>Returns the part-of-speech (feature 1) for the given word id.</summary>
+        public string GetPartOfSpeech(int wordId)
+        {
+            return GetFeature(wordId, 1);
+        }
+
+        public string GetBaseForm(int wordId, char[] surface, int off, int len)
+        {
+            return null; // TODO: add support?
+        }
+
+        public string GetPronunciation(int wordId, char[] surface, int off, int len)
+        {
+            return null; // TODO: add support?
+        }
+
+        public string GetInflectionType(int wordId)
+        {
+            return null; // TODO: add support?
+        }
+
+        public string GetInflectionForm(int wordId)
+        {
+            return null; // TODO: add support?
+        }
+
+        // Splits the stored "reading<SEP>pos" data string for the given word id,
+        // or returns null when no data is stored.
+        private string[] GetAllFeaturesArray(int wordId)
+        {
+            string allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET];
+            if (allFeatures == null)
+            {
+                return null;
+            }
+
+            return allFeatures.Split(new string[] { Dictionary.INTERNAL_SEPARATOR }, StringSplitOptions.RemoveEmptyEntries);
+        }
+
+        // Concatenates the requested features into a comma-separated string;
+        // with no fields specified, all features are returned (CSV-escaped).
+        private string GetFeature(int wordId, params int[] fields)
+        {
+            string[] allFeatures = GetAllFeaturesArray(wordId);
+            if (allFeatures == null)
+            {
+                return null;
+            }
+            StringBuilder sb = new StringBuilder();
+            if (fields.Length == 0)
+            { // All features
+                foreach (string feature in allFeatures)
+                {
+                    sb.Append(CSVUtil.QuoteEscape(feature)).Append(",");
+                }
+            }
+            else if (fields.Length == 1)
+            { // One feature doesn't need to escape value
+                sb.Append(allFeatures[fields[0]]).Append(",");
+            }
+            else
+            {
+                foreach (int field in fields)
+                {
+                    sb.Append(CSVUtil.QuoteEscape(allFeatures[field])).Append(",");
+                }
+            }
+            if (sb.Length == 0)
+            {
+                // LUCENENET: guard against an empty feature set so the
+                // Remove() below cannot throw ArgumentOutOfRangeException.
+                return string.Empty;
+            }
+            return sb.Remove(sb.Length - 1, 1).ToString(); // trim trailing comma
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/GraphvizFormatter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/GraphvizFormatter.cs b/src/Lucene.Net.Analysis.Kuromoji/GraphvizFormatter.cs
new file mode 100644
index 0000000..bd5233b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/GraphvizFormatter.cs
@@ -0,0 +1,197 @@
+using Lucene.Net.Analysis.Ja.Dict;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    // TODO: would be nice to show 2nd best path in a diff't
+    // color...
+
+    /// <summary>
+    /// Outputs the dot (graphviz) string for the viterbi lattice.
+    /// </summary>
+    public class GraphvizFormatter
+    {
+        // LUCENENET: modifier order normalized to "static readonly" per convention.
+        private static readonly string BOS_LABEL = "BOS";
+
+        private static readonly string EOS_LABEL = "EOS";
+
+        private static readonly string FONT_NAME = "Helvetica";
+
+        private readonly ConnectionCosts costs;
+
+        // Maps a node ID to the node ID it transitions to on the best path.
+        private readonly IDictionary<string, string> bestPathMap;
+
+        // Accumulates the dot output across incremental backtraces.
+        private readonly StringBuilder sb = new StringBuilder();
+
+        /// <summary>
+        /// Creates a formatter that labels each arc with the word cost plus the
+        /// bigram connection cost taken from <paramref name="costs"/>.
+        /// </summary>
+        public GraphvizFormatter(ConnectionCosts costs)
+        {
+            this.costs = costs;
+            this.bestPathMap = new Dictionary<string, string>();
+            sb.Append(FormatHeader());
+            sb.Append("  init [style=invis]\n");
+            sb.Append("  init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
+        }
+
+        /// <summary>
+        /// Appends the closing brace and returns the accumulated dot output.
+        /// </summary>
+        public virtual string Finish()
+        {
+            sb.Append(FormatTrailer());
+            return sb.ToString();
+        }
+
+        // Backtraces another incremental fragment:
+        internal void OnBacktrace(JapaneseTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, bool isEnd)
+        {
+            SetBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
+            sb.Append(FormatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+            if (isEnd)
+            {
+                sb.Append("  fini [style=invis]\n");
+                sb.Append("  ");
+                sb.Append(GetNodeID(endPosData.pos, fromIDX));
+                sb.Append(" -> fini [label=\"" + EOS_LABEL + "\"]");
+            }
+        }
+
+        // Records which arcs make up the best path by walking back-pointers
+        // from the end position down to startPos:
+        private void SetBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX)
+        {
+            bestPathMap.Clear();
+
+            int pos = endPosData.pos;
+            int bestIDX = fromIDX;
+            while (pos > startPos)
+            {
+                Position posData = positions.Get(pos);
+
+                int backPos = posData.backPos[bestIDX];
+                int backIDX = posData.backIndex[bestIDX];
+
+                string toNodeID = GetNodeID(pos, bestIDX);
+                string fromNodeID = GetNodeID(backPos, backIDX);
+
+                // Each node appears at most once on the best path:
+                Debug.Assert(!bestPathMap.ContainsKey(fromNodeID));
+                Debug.Assert(!bestPathMap.Values.Contains(toNodeID));
+                bestPathMap[fromNodeID] = toNodeID;
+                pos = backPos;
+                bestIDX = backIDX;
+            }
+        }
+
+        // Renders nodes and arcs for positions (startPos, endPosData.pos]:
+        private string FormatNodes(JapaneseTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment)
+        {
+            StringBuilder sb = new StringBuilder();
+            // Output nodes
+            for (int pos = startPos + 1; pos <= endPosData.pos; pos++)
+            {
+                Position posData = positions.Get(pos);
+                for (int idx = 0; idx < posData.count; idx++)
+                {
+                    sb.Append("  ");
+                    sb.Append(GetNodeID(pos, idx));
+                    sb.Append(" [label=\"");
+                    sb.Append(pos);
+                    sb.Append(": ");
+                    sb.Append(posData.lastRightID[idx]);
+                    sb.Append("\"]\n");
+                }
+            }
+
+            // Output arcs
+            for (int pos = endPosData.pos; pos > startPos; pos--)
+            {
+                Position posData = positions.Get(pos);
+                for (int idx = 0; idx < posData.count; idx++)
+                {
+                    Position backPosData = positions.Get(posData.backPos[idx]);
+                    string toNodeID = GetNodeID(pos, idx);
+                    string fromNodeID = GetNodeID(posData.backPos[idx], posData.backIndex[idx]);
+
+                    sb.Append("  ");
+                    sb.Append(fromNodeID);
+                    sb.Append(" -> ");
+                    sb.Append(toNodeID);
+
+                    // LUCENENET: combined the lookup and comparison; behavior is
+                    // unchanged (a missed lookup left path null, so Equals was false).
+                    string path;
+                    bool onBestPath = bestPathMap.TryGetValue(fromNodeID, out path) && toNodeID.Equals(path);
+                    string attrs = onBestPath
+                        ? " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20" // highlight best path
+                        : "";
+
+                    IDictionary dict = tok.GetDict(posData.backType[idx]);
+                    int wordCost = dict.GetWordCost(posData.backID[idx]);
+                    int bgCost = costs.Get(backPosData.lastRightID[posData.backIndex[idx]],
+                                                 dict.GetLeftId(posData.backID[idx]));
+
+                    string surfaceForm = new string(fragment,
+                                                          posData.backPos[idx] - startPos,
+                                                          pos - posData.backPos[idx]);
+
+                    sb.Append(" [label=\"");
+                    sb.Append(surfaceForm);
+                    sb.Append(' ');
+                    sb.Append(wordCost);
+                    if (bgCost >= 0)
+                    {
+                        sb.Append('+');
+                    }
+                    sb.Append(bgCost);
+                    sb.Append("\"");
+                    sb.Append(attrs);
+                    sb.Append("]\n");
+                }
+            }
+            return sb.ToString();
+        }
+
+        // Emits the digraph preamble with global graph/edge/node attributes:
+        private string FormatHeader()
+        {
+            StringBuilder sb = new StringBuilder();
+            sb.Append("digraph viterbi {\n");
+            sb.Append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+            //sb.Append("  // A2 paper size\n");
+            //sb.Append("  size = \"34.4,16.5\";\n");
+            //sb.Append("  // try to fill paper\n");
+            //sb.Append("  ratio = fill;\n");
+            sb.Append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+            sb.Append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+
+            return sb.ToString();
+        }
+
+        private string FormatTrailer()
+        {
+            return "}";
+        }
+
+        // Node IDs have the form "position.index":
+        private string GetNodeID(int pos, int idx)
+        {
+            return pos + "." + idx;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseAnalyzer.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseAnalyzer.cs
new file mode 100644
index 0000000..dccf5ad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseAnalyzer.cs
@@ -0,0 +1,103 @@
+using Lucene.Net.Analysis.Cjk;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Analyzer for Japanese that uses morphological analysis.
+    /// </summary>
+    /// <seealso cref="JapaneseTokenizer"/>
+    public class JapaneseAnalyzer : StopwordAnalyzerBase
+    {
+        private readonly JapaneseTokenizerMode mode;
+        private readonly ISet<string> stoptags;
+        private readonly UserDictionary userDict;
+
+        /// <summary>
+        /// Creates an analyzer with no user dictionary, the default tokenization
+        /// mode, and the default stopword and stoptag sets.
+        /// </summary>
+        public JapaneseAnalyzer(LuceneVersion matchVersion)
+            : this(matchVersion, null, JapaneseTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS)
+        {
+        }
+
+        /// <summary>
+        /// Creates a fully customized analyzer.
+        /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version.</param>
+        /// <param name="userDict">Optional user dictionary; may be <c>null</c>.</param>
+        /// <param name="mode">Tokenization mode.</param>
+        /// <param name="stopwords">Stopword set.</param>
+        /// <param name="stoptags">Part-of-speech stop tags.</param>
+        public JapaneseAnalyzer(LuceneVersion matchVersion, UserDictionary userDict, JapaneseTokenizerMode mode, CharArraySet stopwords, ISet<string> stoptags)
+            : base(matchVersion, stopwords)
+        {
+            this.userDict = userDict;
+            this.mode = mode;
+            this.stoptags = stoptags;
+        }
+
+        /// <summary>Returns the default stopword set.</summary>
+        public static CharArraySet GetDefaultStopSet()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_SET;
+        }
+
+        /// <summary>Returns the default part-of-speech stop tag set.</summary>
+        public static ISet<string> GetDefaultStopTags()
+        {
+            return DefaultSetHolder.DEFAULT_STOP_TAGS;
+        }
+
+        /// <summary>
+        /// Atomically loads DEFAULT_STOP_SET, DEFAULT_STOP_TAGS in a lazy fashion once the 
+        /// outer class accesses the static final set the first time.
+        /// </summary>
+        private static class DefaultSetHolder
+        {
+            internal static readonly CharArraySet DEFAULT_STOP_SET;
+            internal static readonly ISet<string> DEFAULT_STOP_TAGS;
+
+            static DefaultSetHolder()
+            {
+                try
+                {
+                    DEFAULT_STOP_SET = LoadStopwordSet(true, typeof(JapaneseAnalyzer), "stopwords.txt", "#");  // ignore case
+                    CharArraySet tags = LoadStopwordSet(false, typeof(JapaneseAnalyzer), "stoptags.txt", "#");
+                    DEFAULT_STOP_TAGS = new HashSet<string>();
+                    // Copy into a plain HashSet since consumers expect ISet<string>:
+                    foreach (string tag in tags)
+                    {
+                        DEFAULT_STOP_TAGS.Add(tag);
+                    }
+                }
+                catch (IOException ex)
+                {
+                    // default set should always be present as it is part of the distribution (JAR)
+                    throw new Exception("Unable to load default stopword or stoptag set", ex);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Builds the tokenizer/filter chain: base-form stemming, POS stop filtering,
+        /// CJK width folding, stopword removal, katakana stemming, lowercasing.
+        /// </summary>
+        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+        {
+            Tokenizer source = new JapaneseTokenizer(reader, userDict, true, mode);
+            TokenStream result = new JapaneseBaseFormFilter(source);
+            result = new JapanesePartOfSpeechStopFilter(m_matchVersion, result, stoptags);
+            result = new CJKWidthFilter(result);
+            result = new StopFilter(m_matchVersion, result, m_stopwords);
+            result = new JapaneseKatakanaStemFilter(result);
+            result = new LowerCaseFilter(m_matchVersion, result);
+            return new TokenStreamComponents(source, result);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilter.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilter.cs
new file mode 100644
index 0000000..2117737
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseBaseFormFilter.cs
@@ -0,0 +1,65 @@
+using Lucene.Net.Analysis.Ja.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
+
+namespace Lucene.Net.Analysis.Ja
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Replaces term text with the <see cref="IBaseFormAttribute"/>.
+    /// <para/>
+    /// This acts as a lemmatizer for verbs and adjectives.
+    /// To prevent terms from being stemmed use an instance of
+    /// <see cref="SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+    /// the <see cref="IKeywordAttribute"/> before this <see cref="TokenStream"/>.
+    /// </summary>
+    public sealed class JapaneseBaseFormFilter : TokenFilter
+    {
+        private readonly ICharTermAttribute termAtt;
+        private readonly IBaseFormAttribute basicFormAtt;
+        private readonly IKeywordAttribute keywordAtt;
+
+        public JapaneseBaseFormFilter(TokenStream input)
+            : base(input)
+        {
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.basicFormAtt = AddAttribute<IBaseFormAttribute>();
+            this.keywordAtt = AddAttribute<IKeywordAttribute>();
+        }
+
+        public override bool IncrementToken()
+        {
+            // Guard-clause form: stop as soon as the upstream stream is exhausted.
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+            // Keyword-marked tokens are passed through untouched.
+            if (!keywordAtt.IsKeyword)
+            {
+                string baseForm = basicFormAtt.GetBaseForm();
+                if (baseForm != null)
+                {
+                    termAtt.SetEmpty().Append(baseForm);
+                }
+            }
+            return true;
+        }
+    }
+}