You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:37 UTC
[12/13] lucenenet git commit: lucene-cli: Added command for Kuromoji
DictionaryBuilder tool + tests + documentation
lucene-cli: Added command for Kuromoji DictionaryBuilder tool + tests + documentation
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/bacfcc1a
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/bacfcc1a
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/bacfcc1a
Branch: refs/heads/master
Commit: bacfcc1adbe0fa46bbc5a3ba1d657258cb9c571d
Parents: 0f09201
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Mon Jul 24 00:13:23 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Mon Jul 24 00:35:28 2017 +0700
----------------------------------------------------------------------
...nalysisKuromojiBuildDictionaryCommandTest.cs | 104 +++++++++++++++++++
.../lucene-cli/Resources/Strings.Designer.cs | 54 ++++++++++
src/tools/lucene-cli/Resources/Strings.resx | 18 ++++
.../commands/analysis/AnalysisCommand.cs | 2 +-
.../AnalysisKuromojiBuildDictionaryCommand.cs | 95 +++++++++++++++++
src/tools/lucene-cli/docs/analysis/index.md | 1 +
.../docs/analysis/kuromoji-build-dictionary.md | 46 ++++++++
src/tools/lucene-cli/project.json | 3 +-
8 files changed, 321 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
----------------------------------------------------------------------
diff --git a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
new file mode 100644
index 0000000..c8eaa41
--- /dev/null
+++ b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
@@ -0,0 +1,104 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Cli.CommandLine;
+using NUnit.Framework;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Cli.Commands
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Tests for <see cref="AnalysisKuromojiBuildDictionaryCommand"/>: verifies (through
+ /// <c>AssertCommandTranslation</c>) the argument array produced for each command line and the
+ /// behavior when too few or too many positional arguments are supplied.
+ /// </summary>
+ public class AnalysisKuromojiBuildDictionaryCommandTest : CommandTestCase
+ {
+     // Whole-token switch patterns. An unanchored "-e" or "-n" would also match inside a path
+     // argument (for example "C:\lucene-export"), which would make the expected output wrong.
+     private const string EncodingSwitchPattern = @"(^|\s)(-e|--encoding)(\s|$)";
+     private const string NormalizeSwitchPattern = @"(^|\s)(-n|--normalize)(\s|$)";
+
+     /// <summary>
+     /// Creates the command configuration under test, redirecting its <c>Main</c> delegate to the mock app.
+     /// </summary>
+     protected override ConfigurationBase CreateConfiguration(MockConsoleApp app)
+     {
+         return new AnalysisKuromojiBuildDictionaryCommand.Configuration(new CommandLineOptions()) { Main = (args) => app.Main(args) };
+     }
+
+     /// <summary>
+     /// Optional switches accepted by the command, listed in the sequence of the expected output.
+     /// </summary>
+     protected override IList<Arg[]> GetOptionalArgs()
+     {
+         // NOTE: We must order this in the sequence of the expected output.
+         return new List<Arg[]>()
+         {
+             new Arg[] { new Arg(inputPattern: "-e UTF-16|--encoding UTF-16", output: new string[] { "--encoding", "UTF-16" }) },
+             new Arg[] { new Arg(inputPattern: "-n|--normalize", output: new string[] { "true" }) },
+         };
+     }
+
+     /// <summary>
+     /// Required positional arguments (format, input directory, output directory), listed in the
+     /// sequence of the expected output.
+     /// </summary>
+     protected override IList<Arg[]> GetRequiredArgs()
+     {
+         // NOTE: We must order this in the sequence of the expected output.
+         return new List<Arg[]>()
+         {
+             new Arg[] { new Arg(inputPattern: "epidic", output: new string[] { @"epidic" }) },
+             new Arg[] { new Arg(inputPattern: @"C:\lucene-input", output: new string[] { @"C:\lucene-input" }) },
+             new Arg[] { new Arg(inputPattern: @"C:\lucene-output", output: new string[] { @"C:\lucene-output" }) },
+         };
+     }
+
+     /// <summary>
+     /// Checks every combination of required and optional arguments. The command always emits an
+     /// encoding value and a normalize value, so the expected output is built here instead of
+     /// being taken from the optional <c>Arg.Output</c> values.
+     /// </summary>
+     [Test]
+     [LuceneNetSpecific]
+     public override void TestAllValidCombinations()
+     {
+         var requiredArgs = GetRequiredArgs().ExpandArgs().RequiredParameters();
+         var optionalArgs = GetOptionalArgs().ExpandArgs().OptionalParameters();
+
+         // Required arguments only: both defaults are appended.
+         foreach (var requiredArg in requiredArgs)
+         {
+             string command = string.Join(" ", requiredArg.Select(x => x.InputPattern).ToArray());
+             string[] expected = requiredArg.SelectMany(x => x.Output)
+                 .Concat(new string[] {
+                     // Special case: the encoding must always be supplied
+                     "utf-8",
+                     // Special case: normalize must always be supplied
+                     "false"
+                 }).ToArray();
+
+             AssertCommandTranslation(command, expected);
+         }
+
+         // Required arguments followed by each combination of optional switches.
+         foreach (var requiredArg in requiredArgs)
+         {
+             foreach (var optionalArg in optionalArgs)
+             {
+                 // Concat (not Union) so that no command-line token is ever silently de-duplicated.
+                 string command = string.Join(" ", requiredArg.Select(x => x.InputPattern)
+                     .Concat(optionalArg.Select(x => x.InputPattern)).ToArray());
+
+                 bool hasEncoding = Regex.IsMatch(command, EncodingSwitchPattern);
+                 bool hasNormalize = Regex.IsMatch(command, NormalizeSwitchPattern);
+
+                 string[] expected = requiredArg.SelectMany(x => x.Output)
+                     // Special case: the encoding must always be supplied
+                     .Concat(new string[] { hasEncoding ? "UTF-16" : "utf-8" })
+                     // Special case: normalize must always be supplied
+                     .Concat(new string[] { hasNormalize ? "true" : "false" })
+                     .ToArray();
+
+                 AssertCommandTranslation(command, expected);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Two positional arguments are fewer than the three required, so the
+     /// "NotEnoughArguments" resource message (formatted with 3) is expected on the console.
+     /// </summary>
+     [Test]
+     [LuceneNetSpecific]
+     public virtual void TestNotEnoughArguments()
+     {
+         AssertConsoleOutput("one two", FromResource("NotEnoughArguments", 3));
+     }
+
+     /// <summary>
+     /// A fourth positional argument is expected to be rejected with a <see cref="CommandParsingException"/>.
+     /// </summary>
+     [Test]
+     [LuceneNetSpecific]
+     public virtual void TestTooManyArguments()
+     {
+         Assert.Throws<CommandParsingException>(() => AssertConsoleOutput("one two three four", ""));
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.Designer.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/Resources/Strings.Designer.cs b/src/tools/lucene-cli/Resources/Strings.Designer.cs
index 5d1fa93..9af44ff 100644
--- a/src/tools/lucene-cli/Resources/Strings.Designer.cs
+++ b/src/tools/lucene-cli/Resources/Strings.Designer.cs
@@ -69,6 +69,60 @@ namespace Lucene.Net.Cli.Resources {
}
/// <summary>
+ /// Looks up a localized string similar to Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
+ /// Looks up a localized string similar to The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandFormatDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandFormatDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
+ /// Looks up a localized string similar to The directory where the dictionary input files are located..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
+ /// Looks up a localized string similar to The file encoding used by the input files. If not supplied, the default value is `UTF-8`..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
+ /// Looks up a localized string similar to Normalize the entries using normalization form KC..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandNormalizeDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandNormalizeDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
+ /// Looks up a localized string similar to The directory to put the dictionary output..
+ /// </summary>
+ public static string AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription {
+ get {
+ return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription", resourceCulture);
+ }
+ }
+
+ /// <summary>
/// Looks up a localized string similar to Compiles a stemmer table for the Egothor stemmer..
/// </summary>
public static string AnalysisStempelCompileStemsCommandDescription {
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.resx
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/Resources/Strings.resx b/src/tools/lucene-cli/Resources/Strings.resx
index 64be738..727cb62 100644
--- a/src/tools/lucene-cli/Resources/Strings.resx
+++ b/src/tools/lucene-cli/Resources/Strings.resx
@@ -120,6 +120,24 @@
<data name="AnalysisCommandDescription" xml:space="preserve">
<value>Utilities to manage specialized analyzers.</value>
</data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandDescription" xml:space="preserve">
+ <value>Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer.</value>
+ </data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandFormatDescription" xml:space="preserve">
+ <value>The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.</value>
+ </data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription" xml:space="preserve">
+ <value>The directory where the dictionary input files are located.</value>
+ </data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription" xml:space="preserve">
+ <value>The file encoding used by the input files. If not supplied, the default value is `UTF-8`.</value>
+ </data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandNormalizeDescription" xml:space="preserve">
+ <value>Normalize the entries using normalization form KC.</value>
+ </data>
+ <data name="AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription" xml:space="preserve">
+ <value>The directory to put the dictionary output.</value>
+ </data>
<data name="AnalysisStempelCompileStemsCommandDescription" xml:space="preserve">
<value>Compiles a stemmer table for the Egothor stemmer.</value>
</data>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
index 969bd58..a39eaeb 100644
--- a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
+++ b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
@@ -27,7 +27,7 @@
this.Description = FromResource("Description");
//this.Commands.Add(new AnalysisICUBuildRBBIRulesCommand.Configuration(options));
- //this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options));
+ this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options));
this.Commands.Add(new AnalysisStempelCompileStemsCommand.Configuration(options));
this.Commands.Add(new AnalysisStempelPatchStemsCommand.Configuration(options));
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
new file mode 100644
index 0000000..7f10ed7
--- /dev/null
+++ b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
@@ -0,0 +1,95 @@
+using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Cli.CommandLine;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Cli
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// CLI command <c>kuromoji-build-dictionary</c>. Collects the parsed command-line values and
+ /// forwards them to the configured <c>Main</c> delegate (<c>DictionaryBuilder.Main</c> unless replaced).
+ /// </summary>
+ public class AnalysisKuromojiBuildDictionaryCommand : ICommand
+ {
+     /// <summary>
+     /// Declares the command's name, description, three positional arguments and two options.
+     /// </summary>
+     public class Configuration : ConfigurationBase
+     {
+         public Configuration(CommandLineOptions options)
+         {
+             this.Main = (args) => DictionaryBuilder.Main(args);
+
+             this.Name = "kuromoji-build-dictionary";
+             this.Description = FromResource("Description");
+
+             // Positional arguments.
+             this.Format = this.Argument(
+                 "<FORMAT>",
+                 FromResource("FormatDescription"));
+             this.InputDirectory = this.Argument(
+                 "<INPUT_DIRECTORY>",
+                 FromResource("InputDirectoryDescription"));
+             this.OutputDirectory = this.Argument(
+                 "<OUTPUT_DIRECTORY>",
+                 FromResource("OutputDirectoryDescription"));
+
+             // Optional switches.
+             this.InputDirectoryEncoding = this.Option(
+                 "-e|--encoding <ENCODING>",
+                 FromResource("InputDirectoryEncodingDescription"),
+                 CommandOptionType.SingleValue);
+             this.Normalize = this.Option(
+                 "-n|--normalize",
+                 FromResource("NormalizeDescription"),
+                 CommandOptionType.NoValue);
+
+             this.OnExecute(() => new AnalysisKuromojiBuildDictionaryCommand().Run(this));
+         }
+
+         public virtual CommandArgument Format { get; private set; }
+         public virtual CommandArgument InputDirectory { get; private set; }
+         public virtual CommandArgument OutputDirectory { get; private set; }
+         public virtual CommandOption InputDirectoryEncoding { get; private set; }
+         public virtual CommandOption Normalize { get; private set; }
+     }
+
+     /// <summary>
+     /// Validates the parsed command line, builds the argument array and passes it to <c>cmd.Main</c>.
+     /// The array holds the non-null positional arguments, then the encoding (the supplied value,
+     /// or <c>"utf-8"</c> when the option is absent), then <c>"true"</c> or <c>"false"</c> for normalize.
+     /// </summary>
+     /// <param name="cmd">The parsed configuration; must be a <see cref="Configuration"/> instance.</param>
+     /// <returns><c>1</c> when <c>cmd.ValidateArguments(3)</c> fails; otherwise <c>0</c>.</returns>
+     /// <exception cref="System.InvalidCastException"><paramref name="cmd"/> is not a <see cref="Configuration"/>.</exception>
+     public int Run(ConfigurationBase cmd)
+     {
+         if (!cmd.ValidateArguments(3))
+         {
+             return 1;
+         }
+
+         // Direct cast instead of 'as': a wrong configuration type fails right here with a clear
+         // InvalidCastException rather than a NullReferenceException on first use.
+         var input = (Configuration)cmd;
+         var args = new List<string>(input.GetNonNullArguments());
+
+         // Both trailing values are always appended, whether or not the switches were supplied.
+         args.Add(input.InputDirectoryEncoding.HasValue() ? input.InputDirectoryEncoding.Value() : "utf-8");
+         args.Add(input.Normalize.HasValue() ? "true" : "false");
+
+         cmd.Main(args.ToArray());
+         return 0;
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/index.md
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/docs/analysis/index.md b/src/tools/lucene-cli/docs/analysis/index.md
index c114294..9843805 100644
--- a/src/tools/lucene-cli/docs/analysis/index.md
+++ b/src/tools/lucene-cli/docs/analysis/index.md
@@ -6,5 +6,6 @@ Utilities to manage specialized analyzers.
## Commands
+- [kuromoji-build-dictionary](kuromoji-build-dictionary.md)
- [stempel-compile-stems](stempel-compile-stems.md)
- [stempel-patch-stems](stempel-patch-stems.md)
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
new file mode 100644
index 0000000..9fd7cf6
--- /dev/null
+++ b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
@@ -0,0 +1,46 @@
+# kuromoji-build-dictionary
+
+### Name
+
+`analysis-kuromoji-build-dictionary` - Generates a dictionary file for the JapaneseAnalyzer or JapaneseTokenizer in the Lucene.Net.Analysis.Kuromoji project.
+
+### Synopsis
+
+<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary &lt;FORMAT&gt; &lt;INPUT_DIRECTORY&gt; &lt;OUTPUT_DIRECTORY&gt; [-e|--encoding &lt;ENCODING&gt;] [-n|--normalize] [?|-h|--help]</code>
+
+### Description
+
+See the [Kuromoji project documentation](https://github.com/atilika/kuromoji) for more information.
+
+### Arguments
+
+`FORMAT`
+
+The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.
+
+`INPUT_DIRECTORY`
+
+The directory where the dictionary input files are located.
+
+`OUTPUT_DIRECTORY`
+
+The directory to put the dictionary output.
+
+### Options
+
+`?|-h|--help`
+
+Prints out a short help for the command.
+
+`-e|--encoding <ENCODING>`
+
+The file encoding used by the input files. If not supplied, the default value is `UTF-8`.
+
+`-n|--normalize`
+
+Normalize the entries using normalization form KC.
+
+### Example
+
+<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary IPADIC X:\kuromoji-data X:\kuromoji-dictionary --encoding UTF-16</code>
+
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/project.json
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/project.json b/src/tools/lucene-cli/project.json
index 219964d..767a705 100644
--- a/src/tools/lucene-cli/project.json
+++ b/src/tools/lucene-cli/project.json
@@ -1,4 +1,4 @@
-{
+{
"version": "4.8.0",
"entryPoint": "Program",
"buildOptions": {
@@ -19,6 +19,7 @@
"dependencies": {
"Lucene.Net": "4.8.0",
"Lucene.Net.Analysis.Common": "4.8.0",
+ "Lucene.Net.Analysis.Kuromoji": "4.8.0",
"Lucene.Net.Analysis.Stempel": "4.8.0",
"Lucene.Net.Demo": "4.8.0",
"Lucene.Net.Expressions": "4.8.0",