You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/11/23 14:39:08 UTC
[opennlp] branch master updated: OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 905483d2 OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)
905483d2 is described below
commit 905483d23df5ba97a6569d58f934bfabe1f85a5f
Author: Atita Arora <at...@users.noreply.github.com>
AuthorDate: Wed Nov 23 15:39:03 2022 +0100
OPENNLP-1385 - Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool (#428)
* OPENNLP-1385 : Adding the support for Cutoff and Iteration Params to the tokenizer CLI tool
---
.../cmdline/tokenizer/TokenizerTrainerTool.java | 7 +-
.../tools/cmdline/tokenizer/TrainingParams.java | 10 ++
.../opennlp/tools/util/TrainingParameters.java | 23 +++-
.../tokenizer/TokenizerTrainerToolTest.java | 140 +++++++++++++++++++++
.../opennlp/tools/util/TrainingParametersTest.java | 64 ++++++++++
5 files changed, 240 insertions(+), 4 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
index bcf37dea..eb39ff4d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory.TrainerType;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
public final class TokenizerTrainerTool
@@ -59,8 +60,10 @@ public final class TokenizerTrainerTool
public void run(String format, String[] args) {
super.run(format, args);
-
- mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+ if (null != params.getParams())
+ mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+ else
+ mlParams = TrainingParameters.setParams(args);
if (mlParams != null) {
if (!TrainerFactory.isValid(mlParams)) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
index 237173aa..358fc476 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
@@ -42,4 +42,14 @@ interface TrainingParams extends BasicTrainingParams {
description = "A sub-class of TokenizerFactory where to get implementation and resources.")
@OptionalParameter
String getFactory();
+
+ @ParameterDescription(valueName = "cutOffNum", // command line option carrying the feature cutoff
+ description = "Minimal number of times a feature must be seen")
+ @OptionalParameter
+ String getCutoff(); // NOTE(review): typed as String although the value is numeric - confirm this is intended
+
+ @ParameterDescription(valueName = "iterationsNum", // command line option carrying the iteration count
+ description = "Number of training iterations")
+ @OptionalParameter
+ String getIterations(); // NOTE(review): typed as String although the value is numeric - confirm this is intended
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
index d69a9b14..f9048424 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
@@ -27,6 +27,7 @@ import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
+import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.ml.EventTrainer;
public class TrainingParameters {
@@ -38,6 +39,8 @@ public class TrainingParameters {
public static final String ITERATIONS_PARAM = "Iterations";
public static final String CUTOFF_PARAM = "Cutoff";
public static final String THREADS_PARAM = "Threads";
+ public static final int ITERATIONS_DEFAULT_VALUE = 100;
+ public static final int CUTOFF_DEFAULT_VALUE = 5;
private Map<String, Object> parameters = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
@@ -448,8 +451,24 @@ public class TrainingParameters {
TrainingParameters mlParams = new TrainingParameters();
mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
mlParams.put(TrainingParameters.TRAINER_TYPE_PARAM, EventTrainer.EVENT_VALUE);
- mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
- mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, ITERATIONS_DEFAULT_VALUE);
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, CUTOFF_DEFAULT_VALUE);
+
+ return mlParams;
+ }
+
+ public static TrainingParameters setParams(String[] args) {
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM , "MAXENT");
+ mlParams.put(TrainingParameters.TRAINER_TYPE_PARAM , EventTrainer.EVENT_VALUE);
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM ,
+ null != CmdLineUtil.getIntParameter("-" + TrainingParameters.ITERATIONS_PARAM.toLowerCase() , args) ?
+ CmdLineUtil.getIntParameter("-" + TrainingParameters.ITERATIONS_PARAM.toLowerCase() , args) :
+ ITERATIONS_DEFAULT_VALUE);
+ mlParams.put(TrainingParameters.CUTOFF_PARAM ,
+ null != CmdLineUtil.getIntParameter("-" + TrainingParameters.CUTOFF_PARAM.toLowerCase() , args) ?
+ CmdLineUtil.getIntParameter("-" + TrainingParameters.CUTOFF_PARAM.toLowerCase() , args) :
+ CUTOFF_DEFAULT_VALUE);
return mlParams;
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
new file mode 100644
index 00000000..b079bf64
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerToolTest.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.tokenizer;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+
+
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+
+/**
+ * Tests for the {@link TokenizerTrainerTool} class.
+ *
+ * <p>Two of the tests redirect {@code System.in} and {@code System.out}. The original
+ * streams are captured before each test and put back afterwards so that the redirection
+ * cannot leak into other tests running in the same JVM.
+ */
+public class TokenizerTrainerToolTest {
+
+ private TokenizerTrainerTool tokenizerTrainerTool;
+
+ // Standard streams as they were before the test started; restored in tearDown().
+ private InputStream originalIn;
+ private PrintStream originalOut;
+
+ @TempDir
+ public Path tempFolder;
+
+ private final String sampleSuccessData =
+ "Pierre Vinken<SPLIT>, 61 years old<SPLIT>, will join the board as a nonexecutive " +
+ "director Nov. 29<SPLIT>.\n" +
+ "Mr. Vinken is chairman of Elsevier N.V.<SPLIT>, the Dutch publishing group<SPLIT>.\n" +
+ "Rudolph Agnew<SPLIT>, 55 years old and former chairman of Consolidated Gold Fields PLC<SPLIT>,\n" +
+ " was named a nonexecutive director of this British industrial conglomerate<SPLIT>.\n";
+
+ private final String sampleFailureData = "It is Fail Test Case.\n\nNothing in this sentence.";
+
+ @BeforeEach
+ void setUp() {
+ originalIn = System.in;
+ originalOut = System.out;
+ }
+
+ @AfterEach
+ void tearDown() {
+ // Undo any redirection done by redirectStandardStreams(...), even if the test failed.
+ System.setIn(originalIn);
+ System.setOut(originalOut);
+ }
+
+ @Test
+ public void testGetShortDescription() {
+ tokenizerTrainerTool = new TokenizerTrainerTool();
+ // Expected value goes first so a failure message reports the two values the right way round.
+ Assertions.assertEquals("trainer for the learnable tokenizer" ,
+ tokenizerTrainerTool.getShortDescription());
+ }
+
+ @Test
+ public void testLoadDictHappyCase() throws IOException {
+ File dictFile = new File("lang/ga/sentdetect/abb.xml");
+ Dictionary dict = TokenizerTrainerTool.loadDict(dictFile);
+ Assertions.assertNotNull(dict);
+ }
+
+ @Test
+ public void testLoadDictFailCase() throws IOException {
+ // An empty file is not a valid dictionary.
+ Assertions.assertThrows(InvalidFormatException.class ,
+ () -> TokenizerTrainerTool.loadDict(prepareDataFile("")));
+ }
+
+ @Test
+ public void testTestRunHappyCase() throws IOException {
+ // The model is written below tempFolder, which JUnit removes after the test.
+ File model = tempFolder.resolve("model-en.bin").toFile();
+
+ String[] args =
+ new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
+ "-data" , String.valueOf(prepareDataFile(sampleSuccessData)) , "-encoding" , "UTF-8" };
+
+ ByteArrayOutputStream baos = redirectStandardStreams(sampleSuccessData);
+
+ tokenizerTrainerTool = new TokenizerTrainerTool();
+ tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+
+ System.out.flush();
+ final String content = new String(baos.toByteArray() , StandardCharsets.UTF_8);
+ Assertions.assertTrue(content.contains("Number of Event Tokens: 171"));
+ }
+
+ @Test
+ public void testTestRunExceptionCase() throws IOException {
+ // The model path lies below tempFolder, which JUnit removes after the test.
+ File model = tempFolder.resolve("model-en.bin").toFile();
+
+ String[] args =
+ new String[] { "-model" , model.getAbsolutePath() , "-alphaNumOpt" , "false" , "-lang" , "en" ,
+ "-data" , String.valueOf(prepareDataFile(sampleFailureData)) , "-encoding" , "UTF-8" };
+
+ redirectStandardStreams(sampleFailureData);
+
+ Assertions.assertThrows(TerminateToolException.class , () -> {
+ tokenizerTrainerTool = new TokenizerTrainerTool();
+ tokenizerTrainerTool.run(StreamFactoryRegistry.DEFAULT_FORMAT , args);
+ });
+ }
+
+ /**
+ * Feeds {@code stdin} to {@code System.in} and captures everything written to
+ * {@code System.out}. The redirection is undone in {@link #tearDown()}.
+ *
+ * @param stdin The text to expose on standard input.
+ * @return The buffer receiving the UTF-8 encoded standard output.
+ */
+ private ByteArrayOutputStream redirectStandardStreams(String stdin) throws IOException {
+ System.setIn(new ByteArrayInputStream(stdin.getBytes(StandardCharsets.UTF_8)));
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ // Encode with the same charset the captured bytes are decoded with later.
+ System.setOut(new PrintStream(baos , true , StandardCharsets.UTF_8.name()));
+ return baos;
+ }
+
+ private File prepareDataFile(String input) throws IOException {
+ // This is guaranteed to be deleted after the test finishes.
+ File dataFile = tempFolder.resolve("data-en.train").toFile();
+ FileUtils.writeStringToFile(dataFile , input , "ISO-8859-1");
+ return dataFile;
+ }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
index de3a094b..64a8880c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
@@ -60,6 +60,70 @@ public class TrainingParametersTest {
200)); // use different defaults
}
+ @Test
+ public void testSetParamsWithCLIParams() {
+ // Both -cutoff and -iterations are given, so neither default value is expected.
+ final String[] cliArgs = new String[] {
+ "-model" , "en-token-test.bin" ,
+ "-alphaNumOpt" , "isAlphaNumOpt" ,
+ "-lang" , "en" ,
+ "-data" , "en-token.train" ,
+ "-encoding" , "UTF-8" ,
+ "-cutoff" , "10" ,
+ "-iterations" , "50" };
+ final TrainingParameters parsed = TrainingParameters.setParams(cliArgs);
+
+ Assertions.assertEquals("MAXENT" , parsed.algorithm());
+
+ final int iterations = parsed.getIntParameter(
+ TrainingParameters.ITERATIONS_PARAM , TrainingParameters.ITERATIONS_DEFAULT_VALUE);
+ Assertions.assertEquals(50 , iterations);
+
+ final int cutoff = parsed.getIntParameter(
+ TrainingParameters.CUTOFF_PARAM , TrainingParameters.CUTOFF_DEFAULT_VALUE);
+ Assertions.assertEquals(10 , cutoff);
+ }
+
+ @Test
+ public void testSetParamsWithoutCLIParams() {
+ // Neither -cutoff nor -iterations is given, so both default values are expected.
+ final String[] cliArgs = new String[] {
+ "-model" , "en-token-test.bin" ,
+ "-alphaNumOpt" , "isAlphaNumOpt" ,
+ "-lang" , "en" ,
+ "-data" , "en-token.train" ,
+ "-encoding" , "UTF-8" };
+ final TrainingParameters parsed = TrainingParameters.setParams(cliArgs);
+
+ Assertions.assertEquals("MAXENT" , parsed.algorithm());
+
+ final int iterations = parsed.getIntParameter(
+ TrainingParameters.ITERATIONS_PARAM , TrainingParameters.ITERATIONS_DEFAULT_VALUE);
+ Assertions.assertEquals(100 , iterations);
+
+ final int cutoff = parsed.getIntParameter(
+ TrainingParameters.CUTOFF_PARAM , TrainingParameters.CUTOFF_DEFAULT_VALUE);
+ Assertions.assertEquals(5 , cutoff);
+ }
+
+ @Test
+ public void testSetParamsWithoutCutoffCLIParams() {
+ // Only -iterations is given, so the cutoff is expected to be its default value.
+ final String[] cliArgs = new String[] {
+ "-model" , "en-token-test.bin" ,
+ "-alphaNumOpt" , "isAlphaNumOpt" ,
+ "-lang" , "en" ,
+ "-data" , "en-token.train" ,
+ "-encoding" , "UTF-8" ,
+ "-iterations" , "50" };
+ final TrainingParameters parsed = TrainingParameters.setParams(cliArgs);
+
+ Assertions.assertEquals("MAXENT" , parsed.algorithm());
+
+ final int iterations = parsed.getIntParameter(
+ TrainingParameters.ITERATIONS_PARAM , TrainingParameters.ITERATIONS_DEFAULT_VALUE);
+ Assertions.assertEquals(50 , iterations);
+
+ final int cutoff = parsed.getIntParameter(
+ TrainingParameters.CUTOFF_PARAM , TrainingParameters.CUTOFF_DEFAULT_VALUE);
+ Assertions.assertEquals(5 , cutoff);
+ }
+
+ @Test
+ public void testSetParamsWithoutIterationsCLIParams() {
+ // Only -cutoff is given, so the iteration count is expected to be its default value.
+ final String[] cliArgs = new String[] {
+ "-model" , "en-token-test.bin" ,
+ "-alphaNumOpt" , "isAlphaNumOpt" ,
+ "-lang" , "en" ,
+ "-data" , "en-token.train" ,
+ "-encoding" , "UTF-8" ,
+ "-cutoff" , "10" };
+ final TrainingParameters parsed = TrainingParameters.setParams(cliArgs);
+
+ Assertions.assertEquals("MAXENT" , parsed.algorithm());
+
+ final int iterations = parsed.getIntParameter(
+ TrainingParameters.ITERATIONS_PARAM , TrainingParameters.ITERATIONS_DEFAULT_VALUE);
+ Assertions.assertEquals(100 , iterations);
+
+ final int cutoff = parsed.getIntParameter(
+ TrainingParameters.CUTOFF_PARAM , TrainingParameters.CUTOFF_DEFAULT_VALUE);
+ Assertions.assertEquals(10 , cutoff);
+ }
+
@Test
void testGetAlgorithm() {
TrainingParameters tp = build("Algorithm=Perceptron,n1.Algorithm=SVM");