Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/11/23 14:39:08 UTC

[opennlp] branch master updated: OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 905483d2 OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)
905483d2 is described below

commit 905483d23df5ba97a6569d58f934bfabe1f85a5f
Author: Atita Arora <at...@users.noreply.github.com>
AuthorDate: Wed Nov 23 15:39:03 2022 +0100

    OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)
    
    * OPENNLP-1385 : Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool
---
 .../cmdline/tokenizer/TokenizerTrainerTool.java    |   7 +-
 .../tools/cmdline/tokenizer/TrainingParams.java    |  10 ++
 .../opennlp/tools/util/TrainingParameters.java     |  23 +++-
 .../tokenizer/TokenizerTrainerToolTest.java        | 140 +++++++++++++++++++++
 .../opennlp/tools/util/TrainingParametersTest.java |  64 ++++++++++
 5 files changed, 240 insertions(+), 4 deletions(-)

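For quick reference, below is a minimal sketch of how the new -cutoff and -iterations options are expected to flow into the training parameters, based on the TrainingParameters.setParams(...) method added by this commit. The class name and the argument values here are illustrative only and are not part of the commit; the constants and methods used (ITERATIONS_PARAM, CUTOFF_PARAM, the *_DEFAULT_VALUE constants, setParams, getIntParameter) are those shown in the diff and tests below.

    import opennlp.tools.util.TrainingParameters;

    // Illustrative example; not part of the commit.
    public class CutoffIterationsExample {

      public static void main(String[] args) {
        // Same flag layout the tokenizer trainer CLI accepts.
        String[] cliArgs = {
            "-model", "en-token.bin", "-lang", "en",
            "-data", "en-token.train", "-encoding", "UTF-8",
            "-cutoff", "10", "-iterations", "50"
        };

        // setParams(...) reads -cutoff and -iterations from the arguments and
        // falls back to the defaults when either flag is absent.
        TrainingParameters mlParams = TrainingParameters.setParams(cliArgs);

        int iterations = mlParams.getIntParameter(TrainingParameters.ITERATIONS_PARAM,
            TrainingParameters.ITERATIONS_DEFAULT_VALUE);  // 50
        int cutoff = mlParams.getIntParameter(TrainingParameters.CUTOFF_PARAM,
            TrainingParameters.CUTOFF_DEFAULT_VALUE);      // 10

        System.out.println("iterations=" + iterations + ", cutoff=" + cutoff);
      }
    }

When either option is missing from the command line, setParams(...) falls back to the new ITERATIONS_DEFAULT_VALUE (100) and CUTOFF_DEFAULT_VALUE (5) constants, matching the previously hard-coded defaults.
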
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
index bcf37dea..eb39ff4d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory.TrainerType;
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.tokenize.TokenizerFactory;
 import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelUtil;
 
 public final class TokenizerTrainerTool
@@ -59,8 +60,10 @@ public final class TokenizerTrainerTool
 
   public void run(String format, String[] args) {
     super.run(format, args);
-
-    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (null != params.getParams())
+      mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    else
+      mlParams = TrainingParameters.setParams(args);
 
     if (mlParams != null) {
       if (!TrainerFactory.isValid(mlParams)) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
index 237173aa..358fc476 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
@@ -42,4 +42,14 @@ interface TrainingParams extends BasicTrainingParams {
       description = "A sub-class of TokenizerFactory where to get implementation and resources.")
   @OptionalParameter
   String getFactory();
+
+  @ParameterDescription(valueName = "cutOffNum",
+          description = "Minimal number of times a feature must be seen")
+  @OptionalParameter
+  String getCutoff();
+
+  @ParameterDescription(valueName = "iterationsNum",
+          description = "Number of training iterations")
+  @OptionalParameter
+  String getIterations();
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
index d69a9b14..f9048424 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
 import java.util.Properties;
 import java.util.TreeMap;
 
+import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.ml.EventTrainer;
 
 public class TrainingParameters {
@@ -38,6 +39,8 @@ public class TrainingParameters {
   public static final String ITERATIONS_PARAM = "Iterations";
   public static final String CUTOFF_PARAM = "Cutoff";
   public static final String THREADS_PARAM = "Threads";
+  public static final int ITERATIONS_DEFAULT_VALUE = 100;
+  public static final int CUTOFF_DEFAULT_VALUE = 5;
 
   private Map<String, Object> parameters = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 
@@ -448,8 +451,24 @@ public class TrainingParameters {
     TrainingParameters mlParams = new TrainingParameters();
     mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
     mlParams.put(TrainingParameters.TRAINER_TYPE_PARAM, EventTrainer.EVENT_VALUE);
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, ITERATIONS_DEFAULT_VALUE);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, CUTOFF_DEFAULT_VALUE);
+
+    return mlParams;
+  }
+
+  public static TrainingParameters setParams(String[] args) {
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM , "MAXENT");
+    mlParams.put(TrainingParameters.TRAINER_TYPE_PARAM , EventTrainer.EVENT_VALUE);
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM ,
+        null != CmdLineUtil.getIntParameter("-" + TrainingParameters.ITERATIONS_PARAM.toLowerCase() , args) ?
+            CmdLineUtil.getIntParameter("-" + TrainingParameters.ITERATIONS_PARAM.toLowerCase() , args) :
+            ITERATIONS_DEFAULT_VALUE);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM ,
+        null != CmdLineUtil.getIntParameter("-" + TrainingParameters.CUTOFF_PARAM.toLowerCase() , args) ?
+            CmdLineUtil.getIntParameter("-" + TrainingParameters.CUTOFF_PARAM.toLowerCase() , args) :
+            CUTOFF_DEFAULT_VALUE);
 
     return mlParams;
   }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
new file mode 100644
index 00000000..b079bf64
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.tokenizer;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+
+
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+
+/**
+ * Tests for the {@link TokenizerTrainerTool} class.
+ */
+public class TokenizerTrainerToolTest {
+
+  private TokenizerTrainerTool tokenizerTrainerTool;
+
+  @TempDir
+  public Path tempFolder;
+
+  private String sampleSuccessData =
+      "Pierre Vinken<SPLIT>, 61 years old<SPLIT>, will join the board as a nonexecutive " +
+          "director Nov. 29<SPLIT>.\n" +
+          "Mr. Vinken is chairman of Elsevier N.V.<SPLIT>, the Dutch publishing group<SPLIT>.\n" +
+          "Rudolph Agnew<SPLIT>, 55 years old and former chairman of Consolidated Gold Fields PLC<SPLIT>,\n" +
+          "    was named a nonexecutive director of this British industrial conglomerate<SPLIT>.\n";
+
+  private String sampleFailureData = "It is Fail Test Case.\n\nNothing in this sentence.";
+
+  @BeforeEach
+  void setUp() {
+  }
+
+  @AfterEach
+  void tearDown() {
+  }
+
+  @Test
+  public void testGetShortDescription() {
+    tokenizerTrainerTool = new TokenizerTrainerTool();
+    Assertions.assertEquals(tokenizerTrainerTool.getShortDescription() ,
+        "trainer for the learnable tokenizer");
+  }
+
+  @Test
+  public void testLoadDictHappyCase() throws IOException {
+    File dictFile = new File("lang/ga/sentdetect/abb.xml");
+    Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
+    Assertions.assertNotNull(dict);
+  }
+
+  @Test
+  public void testLoadDictFailCase() throws IOException {
+    Assertions.assertThrows(InvalidFormatException.class , () -> {
+      Dictionary dictionary = TokenizerTrainerTool.loadDict(prepareDataFile(""));
+    });
+  }
+
+  @Test()
+  public void testTestRunHappyCase() throws IOException {
+    File model = tempFolder.resolve("model-en.bin").toFile();
+
+    String[] args =
+        new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
+            "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" };
+
+    InputStream stream = new ByteArrayInputStream(sampleSuccessData.getBytes(StandardCharsets.UTF_8));
+    System.setIn(stream);
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    System.setOut(ps);
+
+    tokenizerTrainerTool = new TokenizerTrainerTool();
+    tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+
+    final String content = new String(baos.toByteArray() , StandardCharsets.UTF_8);
+    Assertions.assertTrue(content.contains("Number of Event Tokens: 171"));
+    model.delete();
+  }
+
+  @Test
+  public void testTestRunExceptionCase() throws IOException {
+    File model = tempFolder.resolve("model-en.bin").toFile();
+    model.deleteOnExit();
+
+    String[] args =
+        new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
+            "-data" , String.valueOf(prepareDataFile(sampleFailureData)) , "-encoding" , "UTF-8" };
+
+    InputStream stream = new ByteArrayInputStream(sampleFailureData.getBytes(StandardCharsets.UTF_8));
+    System.setIn(stream);
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    System.setOut(ps);
+
+    Assertions.assertThrows(TerminateToolException.class , () -> {
+      tokenizerTrainerTool = new TokenizerTrainerTool();
+      tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+    });
+
+  }
+
+  private File prepareDataFile(String input) throws IOException {
+    // This is guaranteed to be deleted after the test finishes.
+    File dataFile = tempFolder.resolve("data-en.train").toFile();
+    FileUtils.writeStringToFile(dataFile , input , "ISO-8859-1");
+    return dataFile;
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
index de3a094b..64a8880c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
@@ -60,6 +60,70 @@ public class TrainingParametersTest {
             200));  // use different defaults
   }
 
+  @Test
+  public void testSetParamsWithCLIParams() {
+    String[] args =
+        { "-model" , "en-token-test.bin" , "-alphaNumOpt" , "isAlphaNumOpt" , "-lang" , "en" , "-data" ,
+            "en-token.train" , "-encoding" , "UTF-8" , "-cutoff" , "10" , "-iterations" , "50" };
+    TrainingParameters tr = TrainingParameters.setParams(args);
+
+    Assertions.assertEquals("MAXENT" , tr.algorithm());
+    Assertions.assertEquals(50 ,
+        tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM ,
+            TrainingParameters.ITERATIONS_DEFAULT_VALUE));
+    Assertions.assertEquals(10 ,
+        tr.getIntParameter(TrainingParameters.CUTOFF_PARAM ,
+            TrainingParameters.CUTOFF_DEFAULT_VALUE));
+  }
+
+  @Test
+  public void testSetParamsWithoutCLIParams() {
+    String[] args =
+        { "-model" , "en-token-test.bin" , "-alphaNumOpt" , "isAlphaNumOpt" , "-lang" , "en" , "-data" ,
+            "en-token.train" , "-encoding" , "UTF-8" };
+    TrainingParameters tr = TrainingParameters.setParams(args);
+
+    Assertions.assertEquals("MAXENT" , tr.algorithm());
+    Assertions.assertEquals(100 ,
+        tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM ,
+            TrainingParameters.ITERATIONS_DEFAULT_VALUE));
+    Assertions.assertEquals(5 ,
+        tr.getIntParameter(TrainingParameters.CUTOFF_PARAM ,
+            TrainingParameters.CUTOFF_DEFAULT_VALUE));
+  }
+
+  @Test
+  public void testSetParamsWithoutCutoffCLIParams() {
+    String[] args =
+        { "-model" , "en-token-test.bin" , "-alphaNumOpt" , "isAlphaNumOpt" , "-lang" , "en" , "-data" ,
+            "en-token.train" , "-encoding" , "UTF-8" , "-iterations" , "50" };
+    TrainingParameters tr = TrainingParameters.setParams(args);
+
+    Assertions.assertEquals("MAXENT" , tr.algorithm());
+    Assertions.assertEquals(50 ,
+        tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM ,
+            TrainingParameters.ITERATIONS_DEFAULT_VALUE));
+    Assertions.assertEquals(5 ,
+        tr.getIntParameter(TrainingParameters.CUTOFF_PARAM ,
+            TrainingParameters.CUTOFF_DEFAULT_VALUE));
+  }
+
+  @Test
+  public void testSetParamsWithoutIterationsCLIParams() {
+    String[] args =
+        { "-model" , "en-token-test.bin" , "-alphaNumOpt" , "isAlphaNumOpt" , "-lang" , "en" , "-data" ,
+            "en-token.train" , "-encoding" , "UTF-8" , "-cutoff" , "10" };
+    TrainingParameters tr = TrainingParameters.setParams(args);
+
+    Assertions.assertEquals("MAXENT" , tr.algorithm());
+    Assertions.assertEquals(100 ,
+        tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM ,
+            TrainingParameters.ITERATIONS_DEFAULT_VALUE));
+    Assertions.assertEquals(10 ,
+        tr.getIntParameter(TrainingParameters.CUTOFF_PARAM ,
+            TrainingParameters.CUTOFF_DEFAULT_VALUE));
+  }
+
   @Test
   void testGetAlgorithm() {
     TrainingParameters tp = build("Algorithm=Perceptron,n1.Algorithm=SVM");