You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2017/07/23 17:36:37 UTC

[12/13] lucenenet git commit: lucene-cli: Added command for Kuromoji DictionaryBuilder tool + tests + documentation

lucene-cli: Added command for Kuromoji DictionaryBuilder tool + tests + documentation


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/bacfcc1a
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/bacfcc1a
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/bacfcc1a

Branch: refs/heads/master
Commit: bacfcc1adbe0fa46bbc5a3ba1d657258cb9c571d
Parents: 0f09201
Author: Shad Storhaug <sh...@shadstorhaug.com>
Authored: Mon Jul 24 00:13:23 2017 +0700
Committer: Shad Storhaug <sh...@shadstorhaug.com>
Committed: Mon Jul 24 00:35:28 2017 +0700

----------------------------------------------------------------------
 ...nalysisKuromojiBuildDictionaryCommandTest.cs | 104 +++++++++++++++++++
 .../lucene-cli/Resources/Strings.Designer.cs    |  54 ++++++++++
 src/tools/lucene-cli/Resources/Strings.resx     |  18 ++++
 .../commands/analysis/AnalysisCommand.cs        |   2 +-
 .../AnalysisKuromojiBuildDictionaryCommand.cs   |  95 +++++++++++++++++
 src/tools/lucene-cli/docs/analysis/index.md     |   1 +
 .../docs/analysis/kuromoji-build-dictionary.md  |  46 ++++++++
 src/tools/lucene-cli/project.json               |   3 +-
 8 files changed, 321 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
----------------------------------------------------------------------
diff --git a/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
new file mode 100644
index 0000000..c8eaa41
--- /dev/null
+++ b/src/tools/Lucene.Net.Tests.Cli/Commands/Analysis/AnalysisKuromojiBuildDictionaryCommandTest.cs
@@ -0,0 +1,104 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Cli.CommandLine;
+using NUnit.Framework;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Cli.Commands
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>Checks how a kuromoji-build-dictionary command line is translated into the argument array handed to Main.</summary>
+    public class AnalysisKuromojiBuildDictionaryCommandTest : CommandTestCase
+    {
+        protected override ConfigurationBase CreateConfiguration(MockConsoleApp app)
+        {
+            return new AnalysisKuromojiBuildDictionaryCommand.Configuration(new CommandLineOptions()) { Main = (args) => app.Main(args) };
+        }
+        // Optional switches; TestAllValidCombinations combines these with the required arguments.
+        protected override IList<Arg[]> GetOptionalArgs()
+        {
+            // NOTE: We must order this in the sequence of the expected output.
+            return new List<Arg[]>()
+            {
+                new Arg[] { new Arg(inputPattern: "-e UTF-16|--encoding UTF-16", output: new string[] { "--encoding", "UTF-16" }) },
+                new Arg[] { new Arg(inputPattern: "-n|--normalize", output: new string[] { "true" }) },
+            };
+        }
+        protected override IList<Arg[]> GetRequiredArgs()
+        {
+            // NOTE: We must order this in the sequence of the expected output.
+            return new List<Arg[]>()
+            {
+                new Arg[] { new Arg(inputPattern: "epidic", output: new string[] { @"epidic" }) },
+                new Arg[] { new Arg(inputPattern: @"C:\lucene-input", output: new string[] { @"C:\lucene-input" }) },
+                new Arg[] { new Arg(inputPattern: @"C:\lucene-output", output: new string[] { @"C:\lucene-output" }) },
+            };
+        }
+        // Overridden here: the expected output always ends with an encoding and a normalize value, even when neither switch is given.
+        [Test]
+        [LuceneNetSpecific]
+        public override void TestAllValidCombinations()
+        {
+            var requiredArgs = GetRequiredArgs().ExpandArgs().RequiredParameters();
+            var optionalArgs = GetOptionalArgs().ExpandArgs().OptionalParameters();
+            // Pass 1: required arguments only, so both default values are expected.
+            foreach (var requiredArg in requiredArgs)
+            {
+                AssertCommandTranslation(
+                    string.Join(" ", requiredArg.Select(x => x.InputPattern).ToArray()),
+                    requiredArg.SelectMany(x => x.Output)
+                    // No switches were given, so the defaults follow the required values:
+                    .Concat(new string[] {
+                        // Special case: the encoding must always be supplied
+                        "utf-8",
+                        // Special case: normalize must always be supplied
+                        "false"
+                    }).ToArray());
+            }
+            // Pass 2: every required/optional pairing; the expected tail depends on which switches appear in the command text.
+            foreach (var requiredArg in requiredArgs)
+            {
+                foreach (var optionalArg in optionalArgs)
+                {
+                    string command = string.Join(" ", requiredArg.Select(x => x.InputPattern).Union(optionalArg.Select(x => x.InputPattern).ToArray()));
+                    string[] expected = requiredArg.SelectMany(x => x.Output)
+                        // Special case: the encoding must always be supplied
+                        .Concat(Regex.IsMatch(command, "-e|--encoding") ? new string[] { "UTF-16" } : new string[] { "utf-8" })
+                        // Special case: normalize must always be supplied
+                        .Concat(Regex.IsMatch(command, "-n|--normalize") ? new string[] { "true" } : new string[] { "false" }).ToArray();
+                    AssertCommandTranslation(command, expected);
+                }
+            }
+        }
+        // "one two" supplies only two of the three required arguments.
+        [Test]
+        [LuceneNetSpecific]
+        public virtual void TestNotEnoughArguments()
+        {
+            AssertConsoleOutput("one two", FromResource("NotEnoughArguments", 3));
+        }
+        // A fourth positional argument is expected to be rejected with a CommandParsingException.
+        [Test]
+        [LuceneNetSpecific]
+        public virtual void TestTooManyArguments()
+        {
+            Assert.Throws<CommandParsingException>(() => AssertConsoleOutput("one two three four", ""));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.Designer.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/Resources/Strings.Designer.cs b/src/tools/lucene-cli/Resources/Strings.Designer.cs
index 5d1fa93..9af44ff 100644
--- a/src/tools/lucene-cli/Resources/Strings.Designer.cs
+++ b/src/tools/lucene-cli/Resources/Strings.Designer.cs
@@ -69,6 +69,60 @@ namespace Lucene.Net.Cli.Resources {
         }
         
         /// <summary>
+        ///    Looks up a localized string similar to Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
+        ///    Looks up a localized string similar to The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandFormatDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandFormatDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
+        ///    Looks up a localized string similar to The directory where the dictionary input files are located..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
+        ///    Looks up a localized string similar to The file encoding used by the input files. If not supplied, the default value is `UTF-8`..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
+        ///    Looks up a localized string similar to Normalize the entries using normalization form KC..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandNormalizeDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandNormalizeDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
+        ///    Looks up a localized string similar to The directory to put the dictionary output..
+        /// </summary>
+        public static string AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription {
+            get {
+                return ResourceManager.GetString("AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription", resourceCulture);
+            }
+        }
+        
+        /// <summary>
         ///    Looks up a localized string similar to Compiles a stemmer table for the Egothor stemmer..
         /// </summary>
         public static string AnalysisStempelCompileStemsCommandDescription {

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/Resources/Strings.resx
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/Resources/Strings.resx b/src/tools/lucene-cli/Resources/Strings.resx
index 64be738..727cb62 100644
--- a/src/tools/lucene-cli/Resources/Strings.resx
+++ b/src/tools/lucene-cli/Resources/Strings.resx
@@ -120,6 +120,24 @@
   <data name="AnalysisCommandDescription" xml:space="preserve">
     <value>Utilities to manage specialized analyzers.</value>
   </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandDescription" xml:space="preserve">
+    <value>Builds a custom dictionary that can be used by the JapaneseAnalyzer or JapaneseTokenizer.</value>
+  </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandFormatDescription" xml:space="preserve">
+    <value>The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.</value>
+  </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryDescription" xml:space="preserve">
+    <value>The directory where the dictionary input files are located.</value>
+  </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandInputDirectoryEncodingDescription" xml:space="preserve">
+    <value>The file encoding used by the input files. If not supplied, the default value is `UTF-8`.</value>
+  </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandNormalizeDescription" xml:space="preserve">
+    <value>Normalize the entries using normalization form KC.</value>
+  </data>
+  <data name="AnalysisKuromojiBuildDictionaryCommandOutputDirectoryDescription" xml:space="preserve">
+    <value>The directory to put the dictionary output.</value>
+  </data>
   <data name="AnalysisStempelCompileStemsCommandDescription" xml:space="preserve">
     <value>Compiles a stemmer table for the Egothor stemmer.</value>
   </data>

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
index 969bd58..a39eaeb 100644
--- a/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
+++ b/src/tools/lucene-cli/commands/analysis/AnalysisCommand.cs
@@ -27,7 +27,7 @@
                 this.Description = FromResource("Description");
 
                 //this.Commands.Add(new AnalysisICUBuildRBBIRulesCommand.Configuration(options));
-                //this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options));
+                this.Commands.Add(new AnalysisKuromojiBuildDictionaryCommand.Configuration(options));
                 this.Commands.Add(new AnalysisStempelCompileStemsCommand.Configuration(options));
                 this.Commands.Add(new AnalysisStempelPatchStemsCommand.Configuration(options));
 

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
new file mode 100644
index 0000000..7f10ed7
--- /dev/null
+++ b/src/tools/lucene-cli/commands/analysis/analysis-kuromoji-build-dictionary/AnalysisKuromojiBuildDictionaryCommand.cs
@@ -0,0 +1,95 @@
+using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Cli.CommandLine;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Cli
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>CLI command that forwards its arguments to the Kuromoji <see cref="DictionaryBuilder"/> tool.</summary>
+    public class AnalysisKuromojiBuildDictionaryCommand : ICommand
+    {
+        public class Configuration : ConfigurationBase
+        {
+            public Configuration(CommandLineOptions options)
+            {
+                this.Main = (args) => DictionaryBuilder.Main(args);
+
+                this.Name = "kuromoji-build-dictionary";
+                this.Description = FromResource("Description");
+                // Three positional arguments; Run refuses to proceed unless all three are present.
+                this.Format = this.Argument(
+                    "<FORMAT>",
+                    FromResource("FormatDescription"));
+                this.InputDirectory = this.Argument(
+                    "<INPUT_DIRECTORY>",
+                    FromResource("InputDirectoryDescription"));
+                this.OutputDirectory = this.Argument(
+                    "<OUTPUT_DIRECTORY>",
+                    FromResource("OutputDirectoryDescription"));
+                this.InputDirectoryEncoding = this.Option(
+                    "-e|--encoding <ENCODING>",
+                    FromResource("InputDirectoryEncodingDescription"),
+                    CommandOptionType.SingleValue);
+                this.Normalize = this.Option(
+                    "-n|--normalize",
+                    FromResource("NormalizeDescription"),
+                    CommandOptionType.NoValue);
+
+                this.OnExecute(() => new AnalysisKuromojiBuildDictionaryCommand().Run(this));
+            }
+
+            public virtual CommandArgument Format { get; private set; }
+            public virtual CommandArgument InputDirectory { get; private set; }
+            public virtual CommandArgument OutputDirectory { get; private set; }
+            public virtual CommandOption InputDirectoryEncoding { get; private set; }
+            public virtual CommandOption Normalize { get; private set; }
+        }
+        /// <summary>Appends the encoding and normalize values to the positional arguments and passes them to <c>cmd.Main</c>; returns 1 when argument validation fails, otherwise 0.</summary>
+        public int Run(ConfigurationBase cmd)
+        {
+            if (!cmd.ValidateArguments(3))
+            {
+                return 1;
+            }
+            // Direct cast: a wrong configuration type fails right here with InvalidCastException instead of a later NullReferenceException.
+            var input = (Configuration)cmd;
+            var args = new List<string>(input.GetNonNullArguments());
+            // The encoding is always passed on; "utf-8" is the fallback when -e/--encoding is absent.
+            if (input.InputDirectoryEncoding.HasValue())
+            {
+                args.Add(input.InputDirectoryEncoding.Value());
+            }
+            else
+            {
+                args.Add("utf-8");
+            }
+            // The normalize flag is likewise always passed on, as the literal "true" or "false".
+            if (input.Normalize.HasValue())
+            {
+                args.Add("true");
+            }
+            else
+            {
+                args.Add("false");
+            }
+
+            cmd.Main(args.ToArray());
+            return 0;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/index.md
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/docs/analysis/index.md b/src/tools/lucene-cli/docs/analysis/index.md
index c114294..9843805 100644
--- a/src/tools/lucene-cli/docs/analysis/index.md
+++ b/src/tools/lucene-cli/docs/analysis/index.md
@@ -6,5 +6,6 @@ Utilities to manage specialized analyzers.
 
 ## Commands
 
+- [kuromoji-build-dictionary](kuromoji-build-dictionary.md)
 - [stempel-compile-stems](stempel-compile-stems.md)
 - [stempel-patch-stems](stempel-patch-stems.md)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
new file mode 100644
index 0000000..9fd7cf6
--- /dev/null
+++ b/src/tools/lucene-cli/docs/analysis/kuromoji-build-dictionary.md
@@ -0,0 +1,46 @@
+# kuromoji-build-dictionary
+
+### Name
+
+`analysis-kuromoji-build-dictionary` - Generates a dictionary file for the JapaneseAnalyzer or JapaneseTokenizer in the Lucene.Net.Analysis.Kuromoji project.
+
+### Synopsis
+
+<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary &lt;FORMAT&gt; &lt;INPUT_DIRECTORY&gt; &lt;OUTPUT_DIRECTORY&gt; [-e|--encoding &lt;ENCODING&gt;] [-n|--normalize] [?|-h|--help]</code>
+
+### Description
+
+See the [Kuromoji project documentation](https://github.com/atilika/kuromoji) for more information.
+
+### Arguments
+
+`FORMAT`
+
+The dictionary format. Valid values are IPADIC and UNIDIC. If an invalid value is passed, IPADIC is assumed.
+
+`INPUT_DIRECTORY`
+
+The directory where the dictionary input files are located.
+
+`OUTPUT_DIRECTORY`
+
+The directory to put the dictionary output.
+
+### Options
+
+`?|-h|--help`
+
+Prints out a short help for the command.
+
+`-e|--encoding <ENCODING>`
+
+The file encoding used by the input files. If not supplied, the default value is `UTF-8`.
+
+`-n|--normalize`
+
+Normalize the entries using normalization form KC.
+
+### Example
+
+<code>dotnet lucene-cli.dll analysis kuromoji-build-dictionary IPADIC X:\kuromoji-data X:\kuromoji-dictionary --encoding UTF-16</code>
+

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/bacfcc1a/src/tools/lucene-cli/project.json
----------------------------------------------------------------------
diff --git a/src/tools/lucene-cli/project.json b/src/tools/lucene-cli/project.json
index 219964d..767a705 100644
--- a/src/tools/lucene-cli/project.json
+++ b/src/tools/lucene-cli/project.json
@@ -1,4 +1,4 @@
-{
+{
   "version": "4.8.0",
   "entryPoint": "Program",
   "buildOptions": {
@@ -19,6 +19,7 @@
   "dependencies": {
     "Lucene.Net": "4.8.0",
     "Lucene.Net.Analysis.Common": "4.8.0",
+    "Lucene.Net.Analysis.Kuromoji": "4.8.0",
     "Lucene.Net.Analysis.Stempel": "4.8.0",
     "Lucene.Net.Demo": "4.8.0",
     "Lucene.Net.Expressions": "4.8.0",