Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/06/24 19:47:16 UTC
[3/4] lucenenet git commit: Ported Lucene.Net.Analysis.SmartCn + tests
Ported Lucene.Net.Analysis.SmartCn + tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/2f5d89b4
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/2f5d89b4
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/2f5d89b4
Branch: refs/heads/master
Commit: 2f5d89b4ae979d376f2ada22b2fb2e775b6e1608
Parents: 468199e
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Sun Jun 25 01:26:30 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Sun Jun 25 02:28:26 2017 +0700
----------------------------------------------------------------------
Lucene.Net.Portable.sln | 20 +
Lucene.Net.sln | 52 ++
.../AnalyzerProfile.cs | 183 +++++
src/Lucene.Net.Analysis.SmartCn/CharType.cs | 67 ++
.../HHMM/AbstractDictionary.cs | 224 ++++++
.../HHMM/BiSegGraph.cs | 256 ++++++
.../HHMM/BigramDictionary.cs | 431 ++++++++++
.../HHMM/HHMMSegmenter.cs | 252 ++++++
.../HHMM/PathNode.cs | 80 ++
.../HHMM/SegGraph.cs | 160 ++++
.../HHMM/SegToken.cs | 123 +++
.../HHMM/SegTokenFilter.cs | 75 ++
.../HHMM/SegTokenPair.cs | 95 +++
.../HHMM/WordDictionary.cs | 778 +++++++++++++++++++
.../HHMM/bigramdict.mem | Bin 0 -> 4825652 bytes
.../HHMM/coredict.mem | Bin 0 -> 1178248 bytes
.../HMMChineseTokenizer.cs | 94 +++
.../HMMChineseTokenizerFactory.cs | 56 ++
.../Lucene.Net.Analysis.SmartCn.csproj | 124 +++
.../Lucene.Net.Analysis.SmartCn.project.json | 11 +
.../Lucene.Net.Analysis.SmartCn.xproj | 40 +
.../Properties/AssemblyInfo.cs | 42 +
.../SentenceTokenizer.cs | 142 ++++
.../SmartChineseAnalyzer.cs | 171 ++++
.../SmartChineseSentenceTokenizerFactory.cs | 52 ++
.../SmartChineseWordTokenFilterFactory.cs | 55 ++
src/Lucene.Net.Analysis.SmartCn/Utility.cs | 196 +++++
.../WordSegmenter.cs | 89 +++
.../WordTokenFilter.cs | 114 +++
src/Lucene.Net.Analysis.SmartCn/WordType.cs | 67 ++
src/Lucene.Net.Analysis.SmartCn/project.json | 53 ++
src/Lucene.Net.Analysis.SmartCn/stopwords.txt | 59 ++
.../Lucene.Net.Tests.Analysis.SmartCn.csproj | 105 +++
...cene.Net.Tests.Analysis.SmartCn.project.json | 11 +
.../Lucene.Net.Tests.Analysis.SmartCn.xproj | 42 +
.../Properties/AssemblyInfo.cs | 37 +
.../Support/TestApiConsistency.cs | 148 ++++
.../Support/TestExceptionSerialization.cs | 54 ++
.../TestHMMChineseTokenizerFactory.cs | 72 ++
.../TestSmartChineseAnalyzer.cs | 354 +++++++++
.../TestSmartChineseFactories.cs | 98 +++
.../project.json | 45 ++
42 files changed, 5127 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.Portable.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln
index 9a49572..d3678ee 100644
--- a/Lucene.Net.Portable.sln
+++ b/Lucene.Net.Portable.sln
@@ -85,6 +85,10 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.ICU", "src\Lucen
EndProject
Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.xproj", "{32FD3471-E862-4055-B969-79C12A656366}"
EndProject
+Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.xproj", "{A400916E-DCB8-4A16-BE83-91891C05191F}"
+EndProject
+Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.xproj", "{2870FB52-1239-493F-A0BE-951660194A66}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -389,6 +393,22 @@ Global
{32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.Build.0 = Release|Any CPU
{32FD3471-E862-4055-B969-79C12A656366}.Release|x86.ActiveCfg = Release|Any CPU
{32FD3471-E862-4055-B969-79C12A656366}.Release|x86.Build.0 = Release|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.Build.0 = Debug|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.Build.0 = Release|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.ActiveCfg = Release|Any CPU
+ {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.Build.0 = Release|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.Build.0 = Debug|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.Build.0 = Release|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.ActiveCfg = Release|Any CPU
+ {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.sln
----------------------------------------------------------------------
diff --git a/Lucene.Net.sln b/Lucene.Net.sln
index 669a57d..be5b2b9 100644
--- a/Lucene.Net.sln
+++ b/Lucene.Net.sln
@@ -94,6 +94,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.ICU", "src\Lucen
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.csproj", "{D5AA1A22-1B28-4DF6-BFDA-02519A189839}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.csproj", "{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.csproj", "{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -903,6 +907,54 @@ Global
{D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
{D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.ActiveCfg = Release|Any CPU
{D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.ActiveCfg = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.Build.0 = Debug|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.ActiveCfg = Release|Any CPU
+ {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.ActiveCfg = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.Build.0 = Debug|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.Build.0 = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.ActiveCfg = Release|Any CPU
+ {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs
new file mode 100644
index 0000000..88c6c27
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs
@@ -0,0 +1,183 @@
+using System;
+using System.IO;
+using System.Security;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Manages analysis data configuration for <see cref="SmartChineseAnalyzer"/>
+ /// <para/>
+ /// <see cref="SmartChineseAnalyzer"/> has a built-in dictionary and stopword list out of the box.
+ /// <para/>
+ /// NOTE: To use a dictionary other than the built-in one, put the "bigramdict.dct" and
+ /// "coredict.dct" files in a subdirectory of your application named "analysis-data". This subdirectory
+ /// can be placed in any directory up to and including the root directory (if OS permissions allow).
+ /// To place the files in an alternate location, set an environment variable named "analysis.data.dir"
+ /// to the directory in which the "bigramdict.dct" and "coredict.dct" files are located.
+ /// <para/>
+ /// The default "bigramdict.dct" and "coredict.dct" files can be found at:
+ /// <a href="https://issues.apache.org/jira/browse/LUCENE-1629">https://issues.apache.org/jira/browse/LUCENE-1629</a>.
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ public class AnalyzerProfile
+ {
+ /// <summary>
+ /// Global indicating the configured analysis data directory
+ /// </summary>
+ public static string ANALYSIS_DATA_DIR = "";
+
+ static AnalyzerProfile()
+ {
+ Init();
+ }
+
+ // LUCENENET specific - changed the logic here to leave the
+ // ANALYSIS_DATA_DIR an empty string if it is not found. This
+ // allows us to skip loading files from disk if there are no files
+ // to load (and fixes LUCENE-1817, which prevented the on-disk files
+ // from ever being loaded).
+ private static void Init()
+ {
+#if NETSTANDARD
+ // Support for GB2312 encoding. See: https://docs.microsoft.com/en-us/dotnet/api/system.text.codepagesencodingprovider?view=netcore-2.0
+ var encodingProvider = System.Text.CodePagesEncodingProvider.Instance;
+ System.Text.Encoding.RegisterProvider(encodingProvider);
+#endif
+
+ string dirName = "analysis-data";
+ //string propName = "analysis.properties";
+
+ // Try the system property:-Danalysis.data.dir=/path/to/analysis-data
+ //ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
+ ANALYSIS_DATA_DIR = GetSystemProperty("analysis.data.dir", "");
+ if (ANALYSIS_DATA_DIR.Length != 0)
+ return;
+
+#if NETSTANDARD
+ string currentPath = System.AppContext.BaseDirectory;
+#else
+ string currentPath = AppDomain.CurrentDomain.BaseDirectory;
+#endif
+
+ //FileInfo[] cadidateFiles = new FileInfo[] { new FileInfo(currentPath + "/" + dirName),
+ // new FileInfo(currentPath + "/bin/" + dirName)/*, new FileInfo("./" + propName),
+ // new FileInfo("./lib/" + propName)*/ };
+ //for (int i = 0; i < cadidateFiles.Length; i++)
+ //{
+ // FileInfo file = cadidateFiles[i];
+ // if (file.Exists)
+ // {
+ // ANALYSIS_DATA_DIR = file.FullName;
+
+ // //if (file.isDirectory())
+ // //{
+ // // ANALYSIS_DATA_DIR = file.getAbsolutePath();
+ // //}
+ // //else if (file.isFile() && GetAnalysisDataDir(file).Length != 0)
+ // //{
+ // // ANALYSIS_DATA_DIR = GetAnalysisDataDir(file);
+ // //}
+ // break;
+ // }
+ //}
+
+ string candidatePath = System.IO.Path.Combine(currentPath, dirName);
+ if (Directory.Exists(candidatePath))
+ {
+ ANALYSIS_DATA_DIR = candidatePath;
+ return;
+ }
+
+
+ try
+ {
+ while (new DirectoryInfo(currentPath).Parent != null)
+ {
+ candidatePath = System.IO.Path.Combine(new DirectoryInfo(currentPath).Parent.FullName, dirName);
+ if (Directory.Exists(candidatePath))
+ {
+ ANALYSIS_DATA_DIR = candidatePath;
+ return;
+ }
+ currentPath = new DirectoryInfo(currentPath).Parent.FullName;
+ }
+ }
+ catch (SecurityException)
+ {
+ // ignore security errors
+ }
+
+
+ //for (int i = 0; i < cadidateDirectories.Count; i++)
+ //{
+ // DirectoryInfo dir = cadidateDirectories[i];
+ // if (dir.Exists)
+ // {
+ // ANALYSIS_DATA_DIR = dir.FullName;
+ // break;
+ // }
+ //}
+
+ //if (ANALYSIS_DATA_DIR.Length == 0)
+ //{
+ // // Dictionary directory cannot be found.
+ // throw new Exception("WARNING: Can not find lexical dictionary directory!"
+ // + " This will cause unpredictable exceptions in your application!"
+ // + " Please refer to the manual to download the dictionaries.");
+ //}
+
+ }
+
+ //private static string GetAnalysisDataDir(FileInfo propFile)
+ //{
+ // Properties prop = new Properties();
+ // try
+ // {
+ // string dir;
+ // using (FileStream input = new FileStream(propFile.FullName, FileMode.Open, FileAccess.Read))
+ // {
+ // prop.load(new StreamReader(input, Encoding.UTF8));
+ // dir = prop.getProperty("analysis.data.dir", "");
+ // }
+ // return dir;
+ // }
+ // catch (IOException e)
+ // {
+ // return "";
+ // }
+ //}
+
+ private static string GetSystemProperty(string property, string defaultValue)
+ {
+ string setting;
+ try
+ {
+ setting = Environment.GetEnvironmentVariable(property);
+ }
+ catch (SecurityException)
+ {
+ setting = null;
+ }
+
+ return (setting == null) ? defaultValue : setting;
+ }
+ }
+}
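
As the doc comment above describes, the data directory can be overridden before the dictionaries are first loaded. A minimal usage sketch, assuming the Lucene.Net 4.8 API surface (SmartChineseAnalyzer, LuceneVersion.LUCENE_48) and a hypothetical dictionary path:

using System;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Util;

public static class AnalyzerProfileExample
{
    public static void Main()
    {
        // Hypothetical path; the directory must contain bigramdict.dct and coredict.dct.
        // It must be set before the dictionaries are loaded for the first time, because
        // AnalyzerProfile reads it from a static constructor.
        Environment.SetEnvironmentVariable("analysis.data.dir", @"C:\data\analysis-data");

        var analyzer = new SmartChineseAnalyzer(LuceneVersion.LUCENE_48);
        Console.WriteLine(AnalyzerProfile.ANALYSIS_DATA_DIR); // the directory that was picked up
    }
}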
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/CharType.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/CharType.cs b/src/Lucene.Net.Analysis.SmartCn/CharType.cs
new file mode 100644
index 0000000..8360802
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/CharType.cs
@@ -0,0 +1,67 @@
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Internal <see cref="SmartChineseAnalyzer"/> character type constants.
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ public enum CharType
+ {
+ /// <summary>
+ /// Punctuation Characters
+ /// </summary>
+ DELIMITER = 0,
+
+ /// <summary>
+ /// Letters
+ /// </summary>
+ LETTER = 1,
+
+ /// <summary>
+ /// Numeric Digits
+ /// </summary>
+ DIGIT = 2,
+
+ /// <summary>
+ /// Han Ideographs
+ /// </summary>
+ HANZI = 3,
+
+ /// <summary>
+ /// Characters that act as a space
+ /// </summary>
+ SPACE_LIKE = 4,
+
+ /// <summary>
+ /// Full-Width letters
+ /// </summary>
+ FULLWIDTH_LETTER = 5,
+
+ /// <summary>
+ /// Full-Width numeric digits
+ /// </summary>
+ FULLWIDTH_DIGIT = 6,
+
+ /// <summary>
+ /// Other (not fitting any of the other categories)
+ /// </summary>
+ OTHER = 7
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
new file mode 100644
index 0000000..efac7d0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs
@@ -0,0 +1,224 @@
+using System;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// <para>
+ /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation.
+ /// </para>
+ /// <para>
+ /// Contains methods for dealing with GB2312 encoding.
+ /// </para>
+ /// @lucene.experimental
+ /// </summary>
+ internal abstract class AbstractDictionary
+ {
+ /// <summary>
+ /// First Chinese Character in GB2312 (15 * 94)
+ /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
+ /// </summary>
+ public static readonly int GB2312_FIRST_CHAR = 1410;
+
+ /// <summary>
+ /// Last Chinese Character in GB2312 (87 * 94).
+ /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
+ /// </summary>
+ public static readonly int GB2312_CHAR_NUM = 87 * 94;
+
+ /// <summary>
+ /// Dictionary data contains 6768 Chinese characters with frequency statistics.
+ /// </summary>
+ public static readonly int CHAR_NUM_IN_FILE = 6768;
+
+ // =====================================================
+ // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
+ // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
+ // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
+ // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
+ // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
+ // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
+ // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
+ // =====================================================
+ //
+ // GB2312 character set:
+ // 01 94 Symbols
+ // 02 72 Numbers
+ // 03 94 Latin
+ // 04 83 Kana
+ // 05 86 Katakana
+ // 06 48 Greek
+ // 07 66 Cyrillic
+ // 08 63 Phonetic Symbols
+ // 09 76 Drawing Symbols
+ // 10-15 Unassigned
+ // 16-55 3755 Plane 1, in pinyin order
+ // 56-87 3008 Plane 2, in radical/stroke order
+ // 88-94 Unassigned
+ // ======================================================
+
+ /// <summary>
+ /// <para>
+ /// Transcode from GB2312 ID to Unicode
+ /// </para>
+ /// <para>
+ /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
+ /// Some regions are unassigned (reserved).
+ /// </para>
+ /// </summary>
+ /// <param name="ccid">GB2312 id</param>
+ /// <returns>unicode String</returns>
+ public virtual string GetCCByGB2312Id(int ccid)
+ {
+ if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM)
+ return "";
+ int cc1 = ccid / 94 + 161;
+ int cc2 = ccid % 94 + 161;
+ byte[] buffer = new byte[2];
+ buffer[0] = (byte)cc1;
+ buffer[1] = (byte)cc2;
+ try
+ {
+ //String cchar = new String(buffer, "GB2312");
+ string cchar = Encoding.GetEncoding("GB2312").GetString(buffer);
+ return cchar;
+ }
+ catch (ArgumentException) // Encoding is not supported by the platform
+ {
+ return "";
+ }
+ }
+
+ /// <summary>
+ /// Transcode from Unicode to GB2312
+ /// </summary>
+ /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param>
+ /// <returns>position in GB2312</returns>
+ public virtual short GetGB2312Id(char ch)
+ {
+ try
+ {
+ //byte[] buffer = Character.ToString(ch).getBytes("GB2312");
+ byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString());
+ //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString());
+ if (buffer.Length != 2)
+ {
+ // Should be a two-byte character
+ return -1;
+ }
+ int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
+ int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
+ // Therefore, each code page only has 16*6-2=94 characters.
+ return (short)(b0 * 94 + b1);
+ }
+ catch (ArgumentException e) // Encoding is not supported by the platform
+ {
+ throw new Exception(e.ToString(), e);
+ }
+ }
+
+ /// <summary>
+ /// 64-bit FNV hash function
+ /// </summary>
+ /// <param name="c">input character</param>
+ /// <returns>hashcode</returns>
+ public virtual long Hash1(char c)
+ {
+ long p = 1099511628211L;
+ long hash = unchecked((long)0xcbf29ce484222325L);
+ hash = (hash ^ (c & 0x00FF)) * p;
+ hash = (hash ^ (c >> 8)) * p;
+ hash += hash << 13;
+ hash ^= hash >> 7;
+ hash += hash << 3;
+ hash ^= hash >> 17;
+ hash += hash << 5;
+ return hash;
+ }
+
+ /// <summary>
+ /// 64-bit FNV hash function
+ /// </summary>
+ /// <param name="carray">character array</param>
+ /// <returns>hashcode</returns>
+ public virtual long Hash1(char[] carray)
+ {
+ long p = 1099511628211L;
+ long hash = unchecked((long)0xcbf29ce484222325L);
+ for (int i = 0; i < carray.Length; i++)
+ {
+ char d = carray[i];
+ hash = (hash ^ (d & 0x00FF)) * p;
+ hash = (hash ^ (d >> 8)) * p;
+ }
+
+ // hash += hash << 13;
+ // hash ^= hash >> 7;
+ // hash += hash << 3;
+ // hash ^= hash >> 17;
+ // hash += hash << 5;
+ return hash;
+ }
+
+ /// <summary>
+ /// The djb2 hash algorithm (k=33) was first reported by Dan Bernstein many
+ /// years ago in comp.lang.c. Another version of this algorithm (now favored
+ /// by Bernstein) uses XOR: hash(i) = hash(i - 1) * 33 ^ str[i]; the magic of
+ /// the number 33 (why it works better than many other constants, prime or
+ /// not) has never been adequately explained.
+ /// </summary>
+ /// <param name="c">character</param>
+ /// <returns>hashcode</returns>
+ public virtual int Hash2(char c)
+ {
+ int hash = 5381;
+
+ /* hash 33 + c */
+ hash = ((hash << 5) + hash) + c & 0x00FF;
+ hash = ((hash << 5) + hash) + c >> 8;
+
+ return hash;
+ }
+
+ /// <summary>
+ /// The djb2 hash algorithm (k=33) was first reported by Dan Bernstein many
+ /// years ago in comp.lang.c. Another version of this algorithm (now favored
+ /// by Bernstein) uses XOR: hash(i) = hash(i - 1) * 33 ^ str[i]; the magic of
+ /// the number 33 (why it works better than many other constants, prime or
+ /// not) has never been adequately explained.
+ /// </summary>
+ /// <param name="carray">character array</param>
+ /// <returns>hashcode</returns>
+ public virtual int Hash2(char[] carray)
+ {
+ int hash = 5381;
+
+ /* hash 33 + c */
+ for (int i = 0; i < carray.Length; i++)
+ {
+ char d = carray[i];
+ hash = ((hash << 5) + hash) + d & 0x00FF;
+ hash = ((hash << 5) + hash) + d >> 8;
+ }
+
+ return hash;
+ }
+ }
+}
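
The GB2312 id arithmetic used by GetCCByGB2312Id and GetGB2312Id above can be checked in isolation. A small sketch, assuming a GB2312-capable encoding is available (on .NET Core that means registering CodePagesEncodingProvider, as AnalyzerProfile does):

using System;
using System.Text;

public static class Gb2312IdExample
{
    public static void Main()
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding gb2312 = Encoding.GetEncoding("GB2312");

        int ccid = 1410; // GB2312_FIRST_CHAR: the first Hanzi row starts at 15 * 94
        byte[] bytes = { (byte)(ccid / 94 + 161), (byte)(ccid % 94 + 161) }; // 0xB0, 0xA1
        string cchar = gb2312.GetString(bytes); // decode the two-byte EUC sequence

        byte[] roundTrip = gb2312.GetBytes(cchar);
        int back = (roundTrip[0] - 161) * 94 + (roundTrip[1] - 161);
        Console.WriteLine($"{cchar} -> {back}"); // the first Hanzi (B0A1 in the table above) maps back to 1410
    }
}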
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
new file mode 100644
index 0000000..adeef2a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs
@@ -0,0 +1,256 @@
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
+ /// <para>
+ /// For each start offset, a list of possible token pairs is stored.
+ /// </para>
+ /// @lucene.experimental
+ /// </summary>
+ internal class BiSegGraph
+ {
+ private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>();
+
+ private IList<SegToken> segTokenList;
+
+ private static BigramDictionary bigramDict = BigramDictionary.GetInstance();
+
+ public BiSegGraph(SegGraph segGraph)
+ {
+ segTokenList = segGraph.MakeIndex();
+ GenerateBiSegGraph(segGraph);
+ }
+
+ /// <summary>
+ /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>
+ /// </summary>
+ private void GenerateBiSegGraph(SegGraph segGraph)
+ {
+ double smooth = 0.1;
+ int wordPairFreq = 0;
+ int maxStart = segGraph.MaxStart;
+ double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
+
+ int next;
+ char[] idBuffer;
+ // get the list of tokens ordered and indexed
+ segTokenList = segGraph.MakeIndex();
+ // Because the start position of startToken is -1, startToken can be obtained when key = -1
+ int key = -1;
+ IList<SegToken> nextTokens = null;
+ while (key < maxStart)
+ {
+ if (segGraph.IsStartExist(key))
+ {
+
+ IList<SegToken> tokenList = segGraph.GetStartList(key);
+
+ // Calculate all tokens for a given key.
+ foreach (SegToken t1 in tokenList)
+ {
+ oneWordFreq = t1.Weight;
+ next = t1.EndOffset;
+ nextTokens = null;
+ // Find the next corresponding Token.
+ // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
+ // If we cannot find the next Token, then go to the end and repeat the same cycle.
+ while (next <= maxStart)
+ {
+ // Because the start position of endToken is sentenceLen, searching up to sentenceLen will find endToken.
+ if (segGraph.IsStartExist(next))
+ {
+ nextTokens = segGraph.GetStartList(next);
+ break;
+ }
+ next++;
+ }
+ if (nextTokens == null)
+ {
+ break;
+ }
+ foreach (SegToken t2 in nextTokens)
+ {
+ idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
+ System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
+ idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
+ System.Array.Copy(t2.CharArray, 0, idBuffer,
+ t1.CharArray.Length + 1, t2.CharArray.Length);
+
+ // Two linked Words frequency
+ wordPairFreq = bigramDict.GetFrequency(idBuffer);
+
+ // Smoothing
+
+ // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
+ weight = -Math
+ .Log(smooth
+ * (1.0 + oneWordFreq)
+ / (Utility.MAX_FREQUENCE + 0.0)
+ + (1.0 - smooth)
+ * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
+
+ SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
+ t2.Index, weight);
+ this.AddSegTokenPair(tokenPair);
+ }
+ }
+ }
+ key++;
+ }
+
+ }
+
+ /// <summary>
+ /// Returns <c>true</c> if there is a list of token pairs at this offset (index of the second token)
+ /// </summary>
+ /// <param name="to">index of the second token in the token pair</param>
+ /// <returns><c>true</c> if a token pair exists</returns>
+ public virtual bool IsToExist(int to)
+ {
+ //return tokenPairListTable.get(Integer.valueOf(to)) != null;
+ //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null;
+ IList<SegTokenPair> result;
+ return tokenPairListTable.TryGetValue(to, out result) && result != null;
+ }
+
+ /// <summary>
+ /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token)
+ /// </summary>
+ /// <param name="to">index of the second token in the token pair</param>
+ /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns>
+ public virtual IList<SegTokenPair> GetToList(int to)
+ {
+ IList<SegTokenPair> result;
+ tokenPairListTable.TryGetValue(to, out result);
+ return result;
+ }
+
+ /// <summary>
+ /// Add a <see cref="SegTokenPair"/>
+ /// </summary>
+ /// <param name="tokenPair"><see cref="SegTokenPair"/></param>
+ public virtual void AddSegTokenPair(SegTokenPair tokenPair)
+ {
+ int to = tokenPair.To;
+ if (!IsToExist(to))
+ {
+ List<SegTokenPair> newlist = new List<SegTokenPair>();
+ newlist.Add(tokenPair);
+ tokenPairListTable[to] = newlist;
+ }
+ else
+ {
+ IList<SegTokenPair> tokenPairList = tokenPairListTable[to];
+ tokenPairList.Add(tokenPair);
+ }
+ }
+
+ /// <summary>
+ /// Get the number of <see cref="SegTokenPair"/> entries in the table.
+ /// </summary>
+ /// <returns>number of <see cref="SegTokenPair"/> entries</returns>
+ public virtual int ToCount
+ {
+ get { return tokenPairListTable.Count; }
+ }
+
+ /// <summary>
+ /// Find the shortest path with the Viterbi algorithm.
+ /// </summary>
+ /// <returns><see cref="T:IList{SegToken}"/></returns>
+ [ExceptionToNetNumericConvention]
+ public virtual IList<SegToken> GetShortPath()
+ {
+ int current;
+ int nodeCount = ToCount;
+ IList<PathNode> path = new List<PathNode>();
+ PathNode zeroPath = new PathNode();
+ zeroPath.Weight = 0;
+ zeroPath.PreNode = 0;
+ path.Add(zeroPath);
+ for (current = 1; current <= nodeCount; current++)
+ {
+ double weight;
+ IList<SegTokenPair> edges = GetToList(current);
+
+ double minWeight = double.MaxValue;
+ SegTokenPair minEdge = null;
+ foreach (SegTokenPair edge in edges)
+ {
+ weight = edge.Weight;
+ PathNode preNode2 = path[edge.From];
+ if (preNode2.Weight + weight < minWeight)
+ {
+ minWeight = preNode2.Weight + weight;
+ minEdge = edge;
+ }
+ }
+ PathNode newNode = new PathNode();
+ newNode.Weight = minWeight;
+ newNode.PreNode = minEdge.From;
+ path.Add(newNode);
+ }
+
+ // Calculate PathNodes
+ int preNode, lastNode;
+ lastNode = path.Count - 1;
+ current = lastNode;
+ IList<int> rpath = new List<int>();
+ IList<SegToken> resultPath = new List<SegToken>();
+
+ rpath.Add(current);
+ while (current != 0)
+ {
+ PathNode currentPathNode = path[current];
+ preNode = currentPathNode.PreNode;
+ rpath.Add(preNode);
+ current = preNode;
+ }
+ for (int j = rpath.Count - 1; j >= 0; j--)
+ {
+ //int idInteger = rpath.get(j);
+ //int id = idInteger.intValue();
+ int id = rpath[j];
+ SegToken t = segTokenList[id];
+ resultPath.Add(t);
+ }
+ return resultPath;
+ }
+
+ public override string ToString()
+ {
+ StringBuilder sb = new StringBuilder();
+ ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
+ foreach (IList<SegTokenPair> segList in values)
+ {
+ foreach (SegTokenPair pair in segList)
+ {
+ sb.Append(pair + "\n");
+ }
+ }
+ return sb.ToString();
+ }
+ }
+}
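
The edge weight computed in GenerateBiSegGraph above is the negative log of a linearly interpolated probability, -log(a * P(Ci-1) + (1 - a) * P(Ci | Ci-1)) with a = 0.1. A standalone sketch of the same arithmetic, using a hypothetical stand-in value for Utility.MAX_FREQUENCE:

using System;

public static class BigramWeightExample
{
    private const double Smooth = 0.1;
    private const double MaxFrequence = 2000000; // hypothetical stand-in for Utility.MAX_FREQUENCE
    private const double Tiny = 1.0 / MaxFrequence;

    // oneWordFreq: frequency of the first token (Ci-1); wordPairFreq: frequency of the pair (Ci-1, Ci).
    public static double Weight(double oneWordFreq, double wordPairFreq)
    {
        return -Math.Log(Smooth * (1.0 + oneWordFreq) / MaxFrequence
                       + (1.0 - Smooth) * ((1.0 - Tiny) * wordPairFreq / (1.0 + oneWordFreq) + Tiny));
    }

    public static void Main()
    {
        // A pair that was never observed gets a larger (more expensive) edge weight than
        // a pair observed 50 times with the same first-token frequency, so the Viterbi
        // shortest path in GetShortPath prefers frequently co-occurring tokens.
        Console.WriteLine(Weight(oneWordFreq: 100, wordPairFreq: 0));
        Console.WriteLine(Weight(oneWordFreq: 100, wordPairFreq: 50));
    }
}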
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
new file mode 100644
index 0000000..cc87ceb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs
@@ -0,0 +1,431 @@
+using Lucene.Net.Support;
+using Lucene.Net.Support.IO;
+using System;
+using System.IO;
+using System.Reflection;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// SmartChineseAnalyzer Bigram dictionary.
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ internal class BigramDictionary : AbstractDictionary
+ {
+ private BigramDictionary()
+ {
+ }
+
+ public static readonly char WORD_SEGMENT_CHAR = '@';
+
+ private static BigramDictionary singleInstance;
+
+ public static readonly int PRIME_BIGRAM_LENGTH = 402137;
+
+ /// <summary>
+ /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
+ /// </summary>
+ private long[] bigramHashTable;
+
+ private int[] frequencyTable;
+
+ private int max = 0;
+
+ private int repeat = 0;
+
+ // static Logger log = Logger.getLogger(BigramDictionary.class);
+
+ private static object syncLock = new object();
+
+ public static BigramDictionary GetInstance()
+ {
+ lock (syncLock)
+ {
+ if (singleInstance == null)
+ {
+ singleInstance = new BigramDictionary();
+
+ // LUCENENET specific
+ // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
+ // This issue still existed as of 4.8.0. Here is the fix - we only
+ // load from a directory if the actual directory exists (AnalyzerProfile
+ // ensures it is an empty string if it is not available).
+ string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+ if (string.IsNullOrEmpty(dictRoot))
+ {
+ singleInstance.Load();
+ }
+ else
+ {
+ singleInstance.Load(dictRoot);
+ }
+
+
+ //try
+ //{
+ // singleInstance.Load();
+ //}
+ //catch (IOException e)
+ //{
+ // string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+ // singleInstance.Load(dictRoot);
+ //}
+ //catch (TypeLoadException e)
+ //{
+ // throw new Exception(e.ToString(), e);
+ //}
+ }
+ return singleInstance;
+ }
+ }
+
+ private bool LoadFromObj(FileInfo serialObj)
+ {
+ try
+ {
+ using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
+ LoadFromInputStream(input);
+ return true;
+ }
+ catch (Exception e)
+ {
+ throw new Exception(e.ToString(), e);
+ }
+ }
+
+ // LUCENENET conversion note:
+ // The data in Lucene is stored in a proprietary binary format (similar to
+ // .NET's BinaryFormatter) that cannot be read back in .NET. Therefore, the
+ // data was extracted using Java's DataOutputStream using the following Java code.
+ // It can then be read in using the LoadFromInputStream method below
+ // (using a DataInputStream instead of a BinaryReader), and saved
+ // in the correct (BinaryWriter) format by calling the SaveToObj method.
+ // Alternatively, the data can be loaded from disk using the files
+ // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file,
+ // which will automatically produce the .mem files.
+
+ //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException
+ //{
+ // // save wordIndexTable
+ // int wiLen = wordIndexTable.length;
+ // stream.writeInt(wiLen);
+ // for (int i = 0; i<wiLen; i++)
+ // {
+ // stream.writeShort(wordIndexTable[i]);
+ // }
+
+ // // save charIndexTable
+ // int ciLen = charIndexTable.length;
+ // stream.writeInt(ciLen);
+ // for (int i = 0; i<ciLen; i++)
+ // {
+ // stream.writeChar(charIndexTable[i]);
+ // }
+
+ // int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length;
+ // stream.writeInt(caDim1);
+ // for (int i = 0; i<caDim1; i++)
+ // {
+ // int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length;
+ // stream.writeInt(caDim2);
+ // for (int j = 0; j<caDim2; j++)
+ // {
+ // int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].length;
+ // stream.writeInt(caDim3);
+ // for (int k = 0; k<caDim3; k++)
+ // {
+ // stream.writeChar(wordItem_charArrayTable[i][j][k]);
+ // }
+ // }
+ // }
+
+ // int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length;
+ // stream.writeInt(fDim1);
+ // for (int i = 0; i<fDim1; i++)
+ // {
+ // int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length;
+ // stream.writeInt(fDim2);
+ // for (int j = 0; j<fDim2; j++)
+ // {
+ // stream.writeInt(wordItem_frequencyTable[i][j]);
+ // }
+ // }
+ //}
+
+ private void LoadFromInputStream(Stream serialObjectInputStream)
+ {
+ //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
+ //bigramHashTable = (long[])input.readObject();
+ //frequencyTable = (int[])input.readObject();
+ //// log.info("load bigram dict from serialization.");
+ //input.close();
+
+ using (var reader = new BinaryReader(serialObjectInputStream))
+ //using (var reader = new DataInputStream(serialObjectInputStream))
+ {
+ // Read bigramHashTable
+ int bhLen = reader.ReadInt32();
+ bigramHashTable = new long[bhLen];
+ for (int i = 0; i < bhLen; i++)
+ {
+ bigramHashTable[i] = reader.ReadInt64();
+ }
+
+ // Read frequencyTable
+ int fLen = reader.ReadInt32();
+ frequencyTable = new int[fLen];
+ for (int i = 0; i < fLen; i++)
+ {
+ frequencyTable[i] = reader.ReadInt32();
+ }
+ }
+
+ // log.info("load bigram dict from serialization.");
+ }
+
+ private void SaveToObj(FileInfo serialObj)
+ {
+ try
+ {
+ //ObjectOutputStream output = new ObjectOutputStream(new FileStream(
+ // serialObj.FullName, FileMode.Create, FileAccess.Write));
+ //output.writeObject(bigramHashTable);
+ //output.writeObject(frequencyTable);
+ //output.close();
+
+ using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
+ {
+ using (BinaryWriter writer = new BinaryWriter(output))
+ {
+ int bhLen = bigramHashTable.Length;
+ writer.Write(bhLen);
+ for (int i = 0; i < bhLen; i++)
+ {
+ writer.Write(bigramHashTable[i]);
+ }
+
+ int fLen = frequencyTable.Length;
+ writer.Write(fLen);
+ for (int i = 0; i < fLen; i++)
+ {
+ writer.Write(frequencyTable[i]);
+ }
+ }
+ }
+ // log.info("serialize bigram dict.");
+ }
+#pragma warning disable 168
+ catch (Exception e)
+#pragma warning restore 168
+ {
+ // log.warn(e.getMessage());
+ }
+ }
+
+ private void Load()
+ {
+ using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem"))
+ {
+ LoadFromInputStream(input);
+ }
+ }
+
+ private void Load(string dictRoot)
+ {
+ string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct");
+
+ FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem"));
+
+ if (serialObj.Exists && LoadFromObj(serialObj))
+ {
+
+ }
+ else
+ {
+ try
+ {
+ bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
+ frequencyTable = new int[PRIME_BIGRAM_LENGTH];
+ for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++)
+ {
+ // it is possible for a value to hash to 0, but the probability is extremely low
+ bigramHashTable[i] = 0;
+ frequencyTable[i] = 0;
+ }
+ LoadFromFile(bigramDictPath);
+ }
+ catch (IOException e)
+ {
+ throw new Exception(e.ToString(), e);
+ }
+ SaveToObj(serialObj);
+ }
+ }
+
+ /// <summary>
+ /// Load the datafile into this <see cref="BigramDictionary"/>
+ /// </summary>
+ /// <param name="dctFilePath">path to the bigram dictionary (bigramdict.dct)</param>
+ /// <exception cref="IOException">If there is a low-level I/O error</exception>
+ public virtual void LoadFromFile(string dctFilePath)
+ {
+ int i, cnt, length, total = 0;
+ // The file contains only 6763 Chinese characters plus 5 reserved slots (3756~3760).
+ // The 3756th slot is used (as a header) to store information.
+ int[] buffer = new int[3];
+ byte[] intBuffer = new byte[4];
+ string tmpword;
+ //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r"))
+ using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
+ {
+
+ // GB2312 characters 0 - 6768
+ for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
+ {
+ string currentStr = GetCCByGB2312Id(i);
+ // if (i == 5231)
+ // System.out.println(i);
+
+ dctFile.Read(intBuffer, 0, intBuffer.Length);
+ // the dictionary was developed for C, and byte order must be converted to work with Java
+ cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32();
+ if (cnt <= 0)
+ {
+ continue;
+ }
+ total += cnt;
+ int j = 0;
+ while (j < cnt)
+ {
+ dctFile.Read(intBuffer, 0, intBuffer.Length);
+ buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
+ .GetInt32();// frequency
+ dctFile.Read(intBuffer, 0, intBuffer.Length);
+ buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
+ .GetInt32();// length
+ dctFile.Read(intBuffer, 0, intBuffer.Length);
+ // buffer[2] = ByteBuffer.wrap(intBuffer).order(
+ // ByteOrder.LITTLE_ENDIAN).getInt();// handle
+
+ length = buffer[1];
+ if (length > 0)
+ {
+ byte[] lchBuffer = new byte[length];
+ dctFile.Read(lchBuffer, 0, lchBuffer.Length);
+ //tmpword = new String(lchBuffer, "GB2312");
+ tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
+ //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
+ if (i != 3755 + GB2312_FIRST_CHAR)
+ {
+ tmpword = currentStr + tmpword;
+ }
+ char[] carray = tmpword.ToCharArray();
+ long hashId = Hash1(carray);
+ int index = GetAvaliableIndex(hashId, carray);
+ if (index != -1)
+ {
+ if (bigramHashTable[index] == 0)
+ {
+ bigramHashTable[index] = hashId;
+ // bigramStringTable[index] = tmpword;
+ }
+ frequencyTable[index] += buffer[0];
+ }
+ }
+ j++;
+ }
+ }
+ }
+ // log.info("load dictionary done! " + dctFilePath + " total:" + total);
+ }
+
+ private int GetAvaliableIndex(long hashId, char[] carray)
+ {
+ int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
+ int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
+ if (hash1 < 0)
+ hash1 = PRIME_BIGRAM_LENGTH + hash1;
+ if (hash2 < 0)
+ hash2 = PRIME_BIGRAM_LENGTH + hash2;
+ int index = hash1;
+ int i = 1;
+ while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+ && i < PRIME_BIGRAM_LENGTH)
+ {
+ index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+ i++;
+ }
+ // System.out.println(i - 1);
+
+ if (i < PRIME_BIGRAM_LENGTH
+ && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId))
+ {
+ return index;
+ }
+ else
+ return -1;
+ }
+
+ /// <summary>
+ /// Look up the index into the frequency array.
+ /// </summary>
+ private int GetBigramItemIndex(char[] carray)
+ {
+ long hashId = Hash1(carray);
+ int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
+ int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
+ if (hash1 < 0)
+ hash1 = PRIME_BIGRAM_LENGTH + hash1;
+ if (hash2 < 0)
+ hash2 = PRIME_BIGRAM_LENGTH + hash2;
+ int index = hash1;
+ int i = 1;
+ repeat++;
+ while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+ && i < PRIME_BIGRAM_LENGTH)
+ {
+ index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+ i++;
+ repeat++;
+ if (i > max)
+ max = i;
+ }
+ // System.out.println(i - 1);
+
+ if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId)
+ {
+ return index;
+ }
+ else
+ return -1;
+ }
+
+ public int GetFrequency(char[] carray)
+ {
+ int index = GetBigramItemIndex(carray);
+ if (index != -1)
+ return frequencyTable[index];
+ return 0;
+ }
+ }
+}
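
The bigramdict.mem resource follows the layout that SaveToObj and LoadFromInputStream above agree on: an Int32 length followed by that many Int64 hash entries, then an Int32 length followed by that many Int32 frequencies. A sketch that writes a tiny synthetic file in the same layout (not the real dictionary data):

using System.IO;

public static class MemFormatExample
{
    public static void Main()
    {
        long[] hashes = { 0L, 12345L };
        int[] freqs = { 0, 7 };

        using (var writer = new BinaryWriter(File.Create("tiny-bigramdict.mem")))
        {
            writer.Write(hashes.Length);                // bigramHashTable length
            foreach (long h in hashes) writer.Write(h); // hash entries (Int64)

            writer.Write(freqs.Length);                 // frequencyTable length
            foreach (int f in freqs) writer.Write(f);   // frequencies (Int32)
        }
    }
}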
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
new file mode 100644
index 0000000..5d6ee55
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs
@@ -0,0 +1,252 @@
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Finds the optimal segmentation of a sentence into Chinese words
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ public class HHMMSegmenter
+ {
+ private static WordDictionary wordDict = WordDictionary.GetInstance();
+
+ /// <summary>
+ /// Create the <see cref="SegGraph"/> for a sentence.
+ /// </summary>
+ /// <param name="sentence">input sentence, without start and end markers</param>
+ /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
+ private SegGraph CreateSegGraph(string sentence)
+ {
+ int i = 0, j;
+ int length = sentence.Length;
+ int foundIndex;
+ CharType[] charTypeArray = GetCharTypes(sentence);
+ StringBuilder wordBuf = new StringBuilder();
+ SegToken token;
+ int frequency = 0; // the number of times word appears.
+ bool hasFullWidth;
+ WordType wordType;
+ char[] charArray;
+
+ SegGraph segGraph = new SegGraph();
+ while (i < length)
+ {
+ hasFullWidth = false;
+ switch (charTypeArray[i])
+ {
+ case CharType.SPACE_LIKE:
+ i++;
+ break;
+ case CharType.HANZI:
+ j = i + 1;
+ //wordBuf.delete(0, wordBuf.length());
+ wordBuf.Remove(0, wordBuf.Length);
+ // Whether or not a single Chinese character (Hanzi) can form a phrase on its own,
+ // store that single Chinese character (Hanzi) in the SegGraph; otherwise it would
+ // cause incorrect word division.
+ wordBuf.Append(sentence[i]);
+ charArray = new char[] { sentence[i] };
+ frequency = wordDict.GetFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+ frequency);
+ segGraph.AddToken(token);
+
+ foundIndex = wordDict.GetPrefixMatch(charArray);
+ while (j <= length && foundIndex != -1)
+ {
+ if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
+ {
+ // It is the phrase we are looking for; In other words, we have found a phrase SegToken
+ // from i to j. It is not a monosyllabic word (single word).
+ frequency = wordDict.GetFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+ frequency);
+ segGraph.AddToken(token);
+ }
+
+ while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
+ j++;
+
+ if (j < length && charTypeArray[j] == CharType.HANZI)
+ {
+ wordBuf.Append(sentence[j]);
+ charArray = new char[wordBuf.Length];
+ //wordBuf.GetChars(0, charArray.Length, charArray, 0);
+ wordBuf.CopyTo(0, charArray, 0, charArray.Length);
+ // idArray has been found (foundWordIndex!=-1) as a prefix before.
+ // Therefore, idArray after it has been lengthened can only appear after foundWordIndex.
+ // So start searching after foundWordIndex.
+ foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
+ j++;
+ }
+ else
+ {
+ break;
+ }
+ }
+ i++;
+ break;
+ case CharType.FULLWIDTH_LETTER:
+ hasFullWidth = true; /* intentional fallthrough */
+
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
+ {
+ if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is LETTER char string.
+ charArray = Utility.STRING_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+
+ case CharType.LETTER:
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER))
+ {
+ if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is LETTER char string.
+ charArray = Utility.STRING_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+ case CharType.FULLWIDTH_DIGIT:
+ hasFullWidth = true; /* intentional fallthrough */
+
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
+ {
+ if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is NUMBER char string.
+ charArray = Utility.NUMBER_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+
+ case CharType.DIGIT:
+ j = i + 1;
+ while (j < length
+ && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT))
+ {
+ if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
+ hasFullWidth = true;
+ j++;
+ }
+ // Found a Token from i to j. Type is NUMBER char string.
+ charArray = Utility.NUMBER_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
+ token = new SegToken(charArray, i, j, wordType, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+ case CharType.DELIMITER:
+ j = i + 1;
+ // No need to search the weight for the punctuation. Picking the highest frequency will work.
+ frequency = Utility.MAX_FREQUENCE;
+ charArray = new char[] { sentence[i] };
+ token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+ default:
+ j = i + 1;
+ // Treat the unrecognized char symbol as unknown string.
+ // For example, any symbol not in GB2312 is treated as one of these.
+ charArray = Utility.STRING_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ token = new SegToken(charArray, i, j, WordType.STRING, frequency);
+ segGraph.AddToken(token);
+ i = j;
+ break;
+ }
+ }
+
+ // Add two more Tokens: "beginning xx beginning"
+ charArray = Utility.START_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
+ segGraph.AddToken(token);
+
+ // "end xx end"
+ charArray = Utility.END_CHAR_ARRAY;
+ frequency = wordDict.GetFrequency(charArray);
+ token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
+ frequency);
+ segGraph.AddToken(token);
+
+ return segGraph;
+ }
+
+ /// <summary>
+ /// Get the character types for every character in a sentence.
+ /// </summary>
+ /// <param name="sentence">input sentence</param>
+ /// <returns>array of character types corresponding to character positions in the sentence</returns>
+ /// <seealso cref="Utility.GetCharType(char)"/>
+ private static CharType[] GetCharTypes(string sentence)
+ {
+ int length = sentence.Length;
+ CharType[] charTypeArray = new CharType[length];
+ // the type of each character by position
+ for (int i = 0; i < length; i++)
+ {
+ charTypeArray[i] = Utility.GetCharType(sentence[i]);
+ }
+
+ return charTypeArray;
+ }
+
+ /// <summary>
+ /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
+ /// </summary>
+ /// <param name="sentence">input sentence</param>
+ /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
+ public virtual IList<SegToken> Process(string sentence)
+ {
+ SegGraph segGraph = CreateSegGraph(sentence);
+ BiSegGraph biSegGraph = new BiSegGraph(segGraph);
+ IList<SegToken> shortPath = biSegGraph.GetShortPath();
+ return shortPath;
+ }
+ }
+}
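
A minimal sketch of the pipeline that Process wires together above (sentence -> SegGraph -> BiSegGraph -> Viterbi shortest path), assuming the default embedded dictionaries are used:

using System;
using Lucene.Net.Analysis.Cn.Smart.HHMM;

public static class HhmmSegmenterExample
{
    public static void Main()
    {
        var segmenter = new HHMMSegmenter();
        // Segment a short sentence; each SegToken carries the characters of one segment.
        foreach (SegToken token in segmenter.Process("我是中国人"))
        {
            Console.WriteLine(new string(token.CharArray));
        }
    }
}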
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
new file mode 100644
index 0000000..11387ad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs
@@ -0,0 +1,80 @@
+using Lucene.Net.Support;
+using System;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// SmartChineseAnalyzer internal node representation
+ /// <para>
+ /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm.
+ /// </para>
+ /// @lucene.experimental
+ /// </summary>
+ internal class PathNode : IComparable<PathNode>
+ {
+ public double Weight { get; set; }
+
+ public int PreNode { get; set; }
+
+ public virtual int CompareTo(PathNode pn)
+ {
+ if (Weight < pn.Weight)
+ return -1;
+ else if (Weight == pn.Weight)
+ return 0;
+ else
+ return 1;
+ }
+
+ /// <summary>
+ /// <see cref="object.GetHashCode()"/>
+ /// </summary>
+ public override int GetHashCode()
+ {
+ int prime = 31;
+ int result = 1;
+ result = prime * result + PreNode;
+ long temp;
+ temp = Number.DoubleToInt64Bits(Weight);
+ result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32)); // port of Java's (int)(temp ^ (temp >>> 32))
+ return result;
+ }
+
+ /// <summary>
+ /// <see cref="object.Equals(object)"/>
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (GetType() != obj.GetType())
+ return false;
+ PathNode other = (PathNode)obj;
+ if (PreNode != other.PreNode)
+ return false;
+ if (Number.DoubleToInt64Bits(Weight) != Number
+ .DoubleToInt64Bits(other.Weight))
+ return false;
+ return true;
+ }
+ }
+}
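As a rough, same-assembly illustration of what the ordering above exists for (keeping the predecessor with the smallest accumulated weight during the Viterbi pass), here is a hypothetical helper; PathNode is internal, and this is not the actual BiSegGraph code:

    using System.Collections.Generic;
    using System.Linq;

    namespace Lucene.Net.Analysis.Cn.Smart.HHMM
    {
        internal static class PathNodeSketch
        {
            // OrderBy with the node itself as the key falls back to IComparable<PathNode>,
            // i.e. the weight-based CompareTo defined above.
            internal static PathNode Lowest(IEnumerable<PathNode> candidates)
            {
                return candidates.OrderBy(n => n).First();
            }
        }
    }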
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
new file mode 100644
index 0000000..e0138c1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
@@ -0,0 +1,160 @@
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Graph representing possible tokens at each start offset in the sentence.
+ /// <para>
+ /// For each start offset, a list of possible tokens is stored.
+ /// </para>
+ /// @lucene.experimental
+ /// </summary>
+ internal class SegGraph
+ {
+ /// <summary>
+ /// Map of start offsets to <see cref="T:IList{SegToken}"/> of tokens at that position
+ /// </summary>
+ private IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>();
+
+ private int maxStart = -1;
+
+ /// <summary>
+ /// Returns <c>true</c> if a mapping for the specified start offset exists
+ /// </summary>
+ /// <param name="s">startOffset</param>
+ /// <returns><c>true</c> if there are tokens for the startOffset</returns>
+ public virtual bool IsStartExist(int s)
+ {
+ //return tokenListTable.get(s) != null;
+ IList<SegToken> result;
+ return tokenListTable.TryGetValue(s, out result) && result != null;
+ }
+
+ /// <summary>
+ /// Get the list of tokens at the specified start offset
+ /// </summary>
+ /// <param name="s">startOffset</param>
+ /// <returns><see cref="T:IList{SegToken}"/> of tokens at the specified start offset.</returns>
+ public virtual IList<SegToken> GetStartList(int s)
+ {
+ IList<SegToken> result;
+ tokenListTable.TryGetValue(s, out result);
+ return result;
+ }
+
+ /// <summary>
+ /// Get the highest start offset in the map. Returns maximum start offset, or -1 if the map is empty.
+ /// </summary>
+ public virtual int MaxStart
+ {
+ get { return maxStart; }
+ }
+
+ /// <summary>
+ /// Set the <see cref="SegToken.Index"/> for each token, based upon its order by startOffset.
+ /// </summary>
+ /// <returns>a <see cref="T:IList{SegToken}"/> of these ordered tokens.</returns>
+ public virtual IList<SegToken> MakeIndex()
+ {
+ IList<SegToken> result = new List<SegToken>();
+ int s = -1, count = 0, size = tokenListTable.Count;
+ IList<SegToken> tokenList;
+ int index = 0;
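+ // Scan candidate start offsets upward from -1 (the sentence-begin marker's position)
+ // until every entry in tokenListTable has been visited.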
+ while (count < size)
+ {
+ if (IsStartExist(s))
+ {
+ tokenList = tokenListTable[s];
+ foreach (SegToken st in tokenList)
+ {
+ st.Index = index;
+ result.Add(st);
+ index++;
+ }
+ count++;
+ }
+ s++;
+ }
+ return result;
+ }
+
+ /// <summary>
+ /// Add a <see cref="SegToken"/> to the mapping, creating a new mapping at the token's startOffset if one does not exist.
+ /// </summary>
+ /// <param name="token">token <see cref="SegToken"/>.</param>
+ public virtual void AddToken(SegToken token)
+ {
+ int s = token.StartOffset;
+ if (!IsStartExist(s))
+ {
+ List<SegToken> newlist = new List<SegToken>();
+ newlist.Add(token);
+ tokenListTable[s] = newlist;
+ }
+ else
+ {
+ IList<SegToken> tokenList = tokenListTable[s];
+ tokenList.Add(token);
+ }
+ if (s > maxStart)
+ {
+ maxStart = s;
+ }
+ }
+
+ /// <summary>
+ /// Return a <see cref="T:IList{SegToken}"/> of all tokens in the map, ordered by startOffset.
+ /// </summary>
+ /// <returns><see cref="T:IList{SegToken}"/> of all tokens in the map.</returns>
+ public virtual IList<SegToken> ToTokenList()
+ {
+ IList<SegToken> result = new List<SegToken>();
+ int s = -1, count = 0, size = tokenListTable.Count;
+ IList<SegToken> tokenList;
+
+ while (count < size)
+ {
+ if (IsStartExist(s))
+ {
+ tokenList = tokenListTable[s];
+ foreach (SegToken st in tokenList)
+ {
+ result.Add(st);
+ }
+ count++;
+ }
+ s++;
+ }
+ return result;
+ }
+
+ public override string ToString()
+ {
+ IList<SegToken> tokenList = this.ToTokenList();
+ StringBuilder sb = new StringBuilder();
+ foreach (SegToken t in tokenList)
+ {
+ sb.Append(t + "\n");
+ }
+ return sb.ToString();
+ }
+ }
+}
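A small same-assembly sketch of the intended shape of the data: overlapping candidates registered under their start offsets, then flattened in offset order by MakeIndex(). SegGraph is internal, and the token text, weights, and WordType values below are arbitrary placeholders:

    using System;

    namespace Lucene.Net.Analysis.Cn.Smart.HHMM
    {
        internal static class SegGraphSketch
        {
            internal static void Run()
            {
                SegGraph graph = new SegGraph();
                // Two candidate readings starting at offset 0 (one and two characters long)
                // plus one candidate starting at offset 2.
                graph.AddToken(new SegToken("中".ToCharArray(), 0, 1, WordType.STRING, 10));
                graph.AddToken(new SegToken("中文".ToCharArray(), 0, 2, WordType.STRING, 25));
                graph.AddToken(new SegToken("分".ToCharArray(), 2, 3, WordType.STRING, 8));

                // MakeIndex assigns SegToken.Index in start-offset order and returns the flattened list.
                foreach (SegToken t in graph.MakeIndex())
                {
                    Console.WriteLine("{0}: {1} [{2},{3})", t.Index, new string(t.CharArray), t.StartOffset, t.EndOffset);
                }
            }
        }
    }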
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
new file mode 100644
index 0000000..48ba8ce
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs
@@ -0,0 +1,123 @@
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// SmartChineseAnalyzer internal token
+ /// <para/>
+ /// @lucene.experimental
+ /// </summary>
+ public class SegToken
+ {
+ /// <summary>
+ /// Character array containing token text
+ /// </summary>
+ [WritableArray]
+ public char[] CharArray { get; set; }
+
+ /// <summary>
+ /// start offset into original sentence
+ /// </summary>
+ public int StartOffset { get; set; }
+
+ /// <summary>
+ /// end offset into original sentence
+ /// </summary>
+ public int EndOffset { get; set; }
+
+ /// <summary>
+ /// <see cref="Smart.WordType"/> of the text
+ /// </summary>
+ public WordType WordType { get; set; }
+
+ /// <summary>
+ /// word frequency
+ /// </summary>
+ public int Weight { get; set; }
+
+ /// <summary>
+ /// during segmentation, this is used to store the index of the token in the token list table
+ /// </summary>
+ public int Index { get; set; }
+
+ /// <summary>
+ /// Create a new <see cref="SegToken"/> from a character array.
+ /// </summary>
+ /// <param name="idArray">character array containing text</param>
+ /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param>
+ /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param>
+ /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param>
+ /// <param name="weight">word frequency</param>
+ public SegToken(char[] idArray, int start, int end, WordType wordType, int weight)
+ {
+ this.CharArray = idArray;
+ this.StartOffset = start;
+ this.EndOffset = end;
+ this.WordType = wordType;
+ this.Weight = weight;
+ }
+
+ /// <summary>
+ /// <see cref="object.GetHashCode()"/>
+ /// </summary>
+ public override int GetHashCode()
+ {
+ int prime = 31;
+ int result = 1;
+ for (int i = 0; i < CharArray.Length; i++)
+ {
+ result = prime * result + CharArray[i];
+ }
+ result = prime * result + EndOffset;
+ result = prime * result + Index;
+ result = prime * result + StartOffset;
+ result = prime * result + Weight;
+ result = prime * result + (int)WordType;
+ return result;
+ }
+
+ /// <summary>
+ /// <see cref="object.Equals(object)"/>
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (GetType() != obj.GetType())
+ return false;
+ SegToken other = (SegToken)obj;
+ if (!Arrays.Equals(CharArray, other.CharArray))
+ return false;
+ if (EndOffset != other.EndOffset)
+ return false;
+ if (Index != other.Index)
+ return false;
+ if (StartOffset != other.StartOffset)
+ return false;
+ if (Weight != other.Weight)
+ return false;
+ if (WordType != other.WordType)
+ return false;
+ return true;
+ }
+ }
+}
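Because Equals/GetHashCode above implement plain value semantics over all fields, two tokens built from the same data compare equal and hash identically. A quick sketch (the text and WordType value are arbitrary):

    using System;
    using Lucene.Net.Analysis.Cn.Smart;
    using Lucene.Net.Analysis.Cn.Smart.HHMM;

    public static class SegTokenEqualityDemo
    {
        public static void Main()
        {
            SegToken a = new SegToken("测试".ToCharArray(), 0, 2, WordType.STRING, 5);
            SegToken b = new SegToken("测试".ToCharArray(), 0, 2, WordType.STRING, 5);

            Console.WriteLine(a.Equals(b));                        // True: Arrays.Equals on CharArray plus field-by-field checks
            Console.WriteLine(a.GetHashCode() == b.GetHashCode()); // True
        }
    }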