You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/03/28 13:56:31 UTC
[opennlp] branch master updated: OPENNLP-1318: Add automatic model downloading. (#383)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 70b6147 OPENNLP-1318: Add automatic model downloading. (#383)
70b6147 is described below
commit 70b614773d1a74aecadd7794abfdbf45b36676b7
Author: Jeff Zemerick <je...@mtnfog.com>
AuthorDate: Mon Mar 28 09:56:28 2022 -0400
OPENNLP-1318: Add automatic model downloading. (#383)
* OPENNLP-1318: Adding automatic model downloads.
---
.../main/java/opennlp/tools/chunker/ChunkerME.java | 11 ++
.../cmdline/namefind/TokenNameFinderTool.java | 2 +-
.../java/opennlp/tools/namefind/NameFinderME.java | 4 +-
.../java/opennlp/tools/postag/POSTaggerME.java | 10 ++
.../tools/sentdetect/SentenceDetectorME.java | 11 ++
.../java/opennlp/tools/tokenize/TokenizerME.java | 11 ++
.../main/java/opennlp/tools/util/DownloadUtil.java | 172 +++++++++++++++++++++
.../java/opennlp/tools/util/model/BaseModel.java | 1 -
.../java/opennlp/tools/chunker/ChunkerMEIT.java | 48 ++++++
.../java/opennlp/tools/chunker/ChunkerMETest.java | 10 ++
.../tools/cmdline/TokenNameFinderToolTest.java | 2 +-
.../java/opennlp/tools/postag/POSTaggerMEIT.java | 49 ++++++
...tectorMETest.java => SentenceDetectorMEIT.java} | 49 +-----
.../tools/sentdetect/SentenceDetectorMETest.java | 85 ++++++++++
.../java/opennlp/tools/tokenize/TokenizerMEIT.java | 39 +++++
.../opennlp/tools/tokenize/TokenizerMETest.java | 12 ++
pom.xml | 25 +++
17 files changed, 490 insertions(+), 51 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
index 4346df3..507b349 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -30,6 +30,7 @@ import opennlp.tools.ml.TrainerFactory.TrainerType;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
@@ -108,6 +109,16 @@ public class ChunkerME implements Chunker {
}
/**
+ * Initializes the chunker with the default model that {@link DownloadUtil} provides
+ * for the given language.
+ * <p>
+ * NOTE(review): DownloadUtil registers no CHUNKER model URL for any language in this
+ * change, so this constructor currently always fails with an IOException - confirm.
+ *
+ * @param language The language key of the default model to load.
+ * @throws IOException Thrown if no default chunker model is registered for the language,
+ * or if the model cannot be downloaded or saved.
+ */
+ public ChunkerME(String language) throws IOException {
+ this((ChunkerModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.CHUNKER,
+ ChunkerModel.class));
+ }
+
+ /**
* Initializes the current instance with the specified model.
* The default beam size is used.
*
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
index 551215f..2fa9970 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
@@ -69,7 +69,7 @@ public final class TokenNameFinderTool extends BasicCmdLineTool {
try {
untokenizedLineStream = new PlainTextByLineStream(
- new SystemInputStreamFactory(), SystemInputStreamFactory.encoding());
+ new SystemInputStreamFactory(), SystemInputStreamFactory.encoding());
String line;
while ((line = untokenizedLineStream.read()) != null) {
String[] whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
index 12ce701..7a3b98a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
@@ -218,8 +218,8 @@ public class NameFinderME implements TokenNameFinder {
}
public static TokenNameFinderModel train(String languageCode, String type,
- ObjectStream<NameSample> samples, TrainingParameters trainParams,
- TokenNameFinderFactory factory) throws IOException {
+ ObjectStream<NameSample> samples, TrainingParameters trainParams,
+ TokenNameFinderFactory factory) throws IOException {
trainParams.putIfAbsent(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, 0);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
index 1edcf4b..95ffffe 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
@@ -36,6 +36,7 @@ import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.SequenceClassificationModel;
import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
@@ -87,6 +88,15 @@ public class POSTaggerME implements POSTagger {
private SequenceValidator<String> sequenceValidator;
/**
+ * Initializes the POS tagger with the default model that {@link DownloadUtil} provides
+ * for the given language.
+ *
+ * @param language The language key of the default POS model to load, e.g. {@code "en"}.
+ * @throws IOException Thrown if no default POS model is registered for the language,
+ * or if the model cannot be downloaded or saved.
+ */
+ public POSTaggerME(String language) throws IOException {
+ this((POSModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.POS, POSModel.class));
+ }
+
+ /**
* Initializes the current instance with the provided model.
*
* @param model
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index b5ad804..b15e08d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.sentdetect.lang.Factory;
+import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
@@ -79,6 +80,16 @@ public class SentenceDetectorME implements SentenceDetector {
protected boolean useTokenEnd;
/**
+ * Initializes the sentence detector with the default model that {@link DownloadUtil}
+ * provides for the given language.
+ *
+ * @param language The language key of the default sentence model to load, e.g. {@code "en"}.
+ * @throws IOException Thrown if no default sentence model is registered for the language,
+ * or if the model cannot be downloaded or saved.
+ */
+ public SentenceDetectorME(String language) throws IOException {
+ this((SentenceModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.SENTENCE_DETECTOR,
+ SentenceModel.class));
+ }
+
+ /**
* Initializes the current instance.
*
* @param model the {@link SentenceModel}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 6d54308..9ecdf13 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
@@ -118,6 +119,16 @@ public class TokenizerME extends AbstractTokenizer {
private List<Span> newTokens;
+ /**
+ * Initializes the tokenizer with the default model that {@link DownloadUtil} provides
+ * for the given language.
+ *
+ * @param language The language key of the default tokenizer model to load, e.g. {@code "en"}.
+ * @throws IOException Thrown if no default tokenizer model is registered for the language,
+ * or if the model cannot be downloaded or saved.
+ */
+ public TokenizerME(String language) throws IOException {
+ this((TokenizerModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.TOKENIZER,
+ TokenizerModel.class));
+ }
+
public TokenizerME(TokenizerModel model) {
TokenizerFactory factory = model.getFactory();
this.alphanumeric = factory.getAlphaNumericPattern();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
new file mode 100644
index 0000000..1969789
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * This class facilitates the downloading of pretrained OpenNLP models.
+ * <p>
+ * Default models are registered per language in {@link #available_models}. Downloaded
+ * models are cached in an {@code .opennlp} directory below the user's home directory.
+ */
+public class DownloadUtil {
+
+  /**
+   * The type of model.
+   */
+  public enum ModelType {
+    TOKENIZER("token"),
+    SENTENCE_DETECTOR("sent"),
+    POS("pos-perceptron"),
+    NAME_FINDER("ner"),
+    CHUNKER("chunker"),
+    PARSER("parser-chunking");
+
+    private final String name;
+
+    ModelType(String name) {
+      this.name = name;
+    }
+  }
+
+  private static final String baseUrl = "https://dlcdn.apache.org/opennlp/";
+
+  // Common location of the UD models; keeping it in one place prevents entries from
+  // silently dropping the directory part of the URL.
+  private static final String udModelsUrl = baseUrl + "models/ud-models-1.0/";
+
+  /**
+   * The URLs of the default models, keyed by language and then by model type.
+   */
+  public static Map<String, Map<ModelType, String>> available_models = new HashMap<>();
+
+  static {
+
+    final Map<ModelType, String> frenchModels = new HashMap<>();
+    // NOTE(review): this file name is irregular compared with the other entries. It is
+    // kept unchanged here - verify it against the published model listing.
+    frenchModels.put(ModelType.SENTENCE_DETECTOR,
+        udModelsUrl + "opennlp-1.0-1.9.3fr-ud-ftb-sentence-1.0-1.9.3.bin");
+    frenchModels.put(ModelType.POS,
+        udModelsUrl + "opennlp-fr-ud-ftb-pos-1.0-1.9.3.bin");
+    frenchModels.put(ModelType.TOKENIZER,
+        udModelsUrl + "opennlp-fr-ud-ftb-tokens-1.0-1.9.3.bin");
+    available_models.put("fr", frenchModels);
+
+    final Map<ModelType, String> germanModels = new HashMap<>();
+    germanModels.put(ModelType.SENTENCE_DETECTOR,
+        udModelsUrl + "opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin");
+    germanModels.put(ModelType.POS,
+        udModelsUrl + "opennlp-de-ud-gsd-pos-1.0-1.9.3.bin");
+    germanModels.put(ModelType.TOKENIZER,
+        udModelsUrl + "opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin");
+    available_models.put("de", germanModels);
+
+    final Map<ModelType, String> englishModels = new HashMap<>();
+    englishModels.put(ModelType.SENTENCE_DETECTOR,
+        udModelsUrl + "opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin");
+    englishModels.put(ModelType.POS,
+        udModelsUrl + "opennlp-en-ud-ewt-pos-1.0-1.9.3.bin");
+    englishModels.put(ModelType.TOKENIZER,
+        udModelsUrl + "opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin");
+    available_models.put("en", englishModels);
+
+    final Map<ModelType, String> italianModels = new HashMap<>();
+    italianModels.put(ModelType.SENTENCE_DETECTOR,
+        udModelsUrl + "opennlp-it-ud-vit-sentence-1.0-1.9.3.bin");
+    italianModels.put(ModelType.POS,
+        udModelsUrl + "opennlp-it-ud-vit-pos-1.0-1.9.3.bin");
+    italianModels.put(ModelType.TOKENIZER,
+        udModelsUrl + "opennlp-it-ud-vit-tokens-1.0-1.9.3.bin");
+    available_models.put("it", italianModels);
+
+    final Map<ModelType, String> dutchModels = new HashMap<>();
+    dutchModels.put(ModelType.SENTENCE_DETECTOR,
+        udModelsUrl + "opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
+    dutchModels.put(ModelType.POS,
+        udModelsUrl + "opennlp-nl-ud-alpino-pos-1.0-1.9.3.bin");
+    dutchModels.put(ModelType.TOKENIZER,
+        udModelsUrl + "opennlp-nl-ud-alpino-tokens-1.0-1.9.3.bin");
+    available_models.put("nl", dutchModels);
+
+  }
+
+  /**
+   * Downloads (or loads from the local cache) the default model registered in
+   * {@link #available_models} for a language and model type.
+   *
+   * @param language The language key, e.g. {@code "en"}.
+   * @param modelType The kind of model to look up.
+   * @param type The model class to instantiate, see {@link #downloadModel(URL, Class)}.
+   * @return The loaded model.
+   * @throws IOException Thrown if no model is registered for the language and model type,
+   *     or if the model cannot be downloaded or saved.
+   */
+  public static BaseModel downloadModel(String language, ModelType modelType, Class<?> type)
+      throws IOException {
+
+    final Map<ModelType, String> models = available_models.get(language);
+    final String url = models != null ? models.get(modelType) : null;
+    if (url == null) {
+      throw new IOException("Invalid model.");
+    }
+
+    return downloadModel(new URL(url), type);
+  }
+
+  /**
+   * Downloads a model from a URL. The model is saved to an .opennlp/ directory
+   * located under the user's home directory. This directory will be created
+   * if it does not already exist. If a model to be downloaded already
+   * exists in that directory, the model will not be re-downloaded.
+   *
+   * @param url The model's URL.
+   * @param type The model class to instantiate: {@link TokenizerModel}, {@link ChunkerModel},
+   *     {@link SentenceModel} or {@link POSModel}. Any other value is loaded as a
+   *     {@link TokenNameFinderModel}.
+   * @return The loaded model, an instance of the class selected by {@code type}.
+   * @throws IOException Thrown if the cache directory cannot be created or the model
+   *     cannot be downloaded or saved.
+   */
+  public static BaseModel downloadModel(URL url, Class<?> type) throws IOException {
+
+    final Path homeDirectory = Paths.get(System.getProperty("user.home") + "/.opennlp/");
+    if (!Files.isDirectory(homeDirectory)) {
+      // Unlike File.mkdir(), this reports a failure instead of returning false silently.
+      Files.createDirectories(homeDirectory);
+    }
+
+    final String filename = url.toString().substring(url.toString().lastIndexOf("/") + 1);
+    final Path localFile = homeDirectory.resolve(filename);
+
+    if (!Files.exists(localFile)) {
+
+      System.out.println("Downloading model " + url + " to " + localFile);
+
+      // Download into a temporary file and move it into place afterwards, so that an
+      // interrupted transfer never leaves a truncated model under the cached name.
+      final Path partialFile = Files.createTempFile(homeDirectory, filename, ".part");
+      try {
+        try (InputStream in = url.openStream()) {
+          Files.copy(in, partialFile, StandardCopyOption.REPLACE_EXISTING);
+        }
+        Files.move(partialFile, localFile, StandardCopyOption.REPLACE_EXISTING);
+      } finally {
+        // No-op after a successful move; removes the leftover after a failure.
+        Files.deleteIfExists(partialFile);
+      }
+
+      System.out.println("Download complete.");
+
+    }
+
+    if (type == TokenizerModel.class) {
+      return new TokenizerModel(localFile);
+    } else if (type == ChunkerModel.class) {
+      return new ChunkerModel(localFile);
+    } else if (type == SentenceModel.class) {
+      return new SentenceModel(localFile);
+    } else if (type == POSModel.class) {
+      return new POSModel(localFile);
+    } else {
+      // Fallback kept for backward compatibility: every other type is loaded as a
+      // name finder model.
+      return new TokenNameFinderModel(localFile);
+    }
+
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
index a6a9a22..8c3d716 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -502,7 +502,6 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
*/
public final String getManifestProperty(String key) {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
-
return manifest.getProperty(key);
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java
new file mode 100644
index 0000000..6d7f410
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.chunker;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Integration test for the {@link ChunkerME} constructor that takes a language key and
+ * obtains its model through the default-model download.
+ * <p>
+ * NOTE(review): DownloadUtil registers no CHUNKER model URL for any language in this
+ * change, so {@code new ChunkerME("en")} is expected to throw an IOException and this
+ * test cannot pass until a chunker model is registered - confirm.
+ */
+public class ChunkerMEIT {
+
+ // Input tokens of the sample sentence.
+ private static String[] toks1 = { "Rockwell", "said", "the", "agreement", "calls", "for",
+ "it", "to", "supply", "200", "additional", "so-called", "shipsets",
+ "for", "the", "planes", "." };
+
+ // POS tags for the tokens in toks1, position by position.
+ private static String[] tags1 = { "NNP", "VBD", "DT", "NN", "VBZ", "IN", "PRP", "TO", "VB",
+ "CD", "JJ", "JJ", "NNS", "IN", "DT", "NNS", "." };
+
+ // Chunk tags the chunker is expected to predict for toks1/tags1.
+ private static String[] expect1 = { "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "B-SBAR",
+ "B-NP", "B-VP", "I-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-PP", "B-NP",
+ "I-NP", "O" };
+
+ @Test
+ public void downloadModel() throws IOException {
+
+ ChunkerME chunker = new ChunkerME("en");
+
+ String[] preds = chunker.chunk(toks1, tags1);
+
+ Assert.assertArrayEquals(expect1, preds);
+ }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index b7654fb..cfbd815 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -64,6 +64,16 @@ public class ChunkerMETest {
"B-NP", "B-VP", "I-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-PP", "B-NP",
"I-NP", "O" };
+  /**
+   * Verifies that requesting a default chunker model fails with an {@link IOException},
+   * because no chunker model is registered for download.
+   */
+  @Test(expected = IOException.class)
+  public void downloadNonExistentModel() throws IOException {
+    // The constructor is expected to throw, so any statement after it would never run.
+    new ChunkerME("en");
+  }
+
@Before
public void startup() throws IOException {
// train the chunker
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
index e76814a..830f3d2 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -67,7 +67,7 @@ public class TokenNameFinderToolTest {
model1.delete();
}
-
+
@Test(expected = TerminateToolException.class)
public void invalidModel() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
new file mode 100644
index 0000000..edb37f4
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.postag;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Integration test for the {@link POSTaggerME} constructor that takes a language key.
+ */
+public class POSTaggerMEIT {
+
+  /**
+   * Tags a short English sentence with the default "en" model and checks every tag.
+   */
+  @Test
+  public void testPOSTagger() throws IOException {
+
+    final String[] sentence = {"The", "driver", "got", "badly", "injured", "."};
+    final String[] expectedTags = {"DT", "NN", "VBD", "RB", "VBN", "."};
+
+    final POSTagger tagger = new POSTaggerME("en");
+    final String[] tags = tagger.tag(sentence);
+
+    // Covers both the length check and the element-by-element comparison.
+    Assert.assertArrayEquals(expectedTags, tags);
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
similarity index 72%
copy from opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
copy to opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
index 87ced1b..2ef425a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
@@ -15,46 +15,21 @@
* limitations under the License.
*/
-
package opennlp.tools.sentdetect;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Test;
-import opennlp.tools.formats.ResourceAsStreamFactory;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.InsufficientTrainingDataException;
-import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-/**
- * Tests for the {@link SentenceDetectorME} class.
- */
-public class SentenceDetectorMETest {
+public class SentenceDetectorMEIT {
@Test
- public void testSentenceDetector() throws IOException {
-
- InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
- "/opennlp/tools/sentdetect/Sentences.txt");
-
- TrainingParameters mlParams = new TrainingParameters();
- mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
- mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
-
- SentenceDetectorFactory factory = new SentenceDetectorFactory("eng", true, null, null);
+ public void testSentenceDetectorDownloadModel() throws IOException {
- SentenceModel sentdetectModel = SentenceDetectorME.train(
- "eng", new SentenceSampleStream(new PlainTextByLineStream(in,
- StandardCharsets.UTF_8)), factory, mlParams);
-
- Assert.assertEquals("eng", sentdetectModel.getLanguage());
-
- SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+ SentenceDetectorME sentDetect = new SentenceDetectorME("en");
// Tests sentence detector with sentDetect method
String sampleSentences1 = "This is a test. There are many tests, this is the second.";
@@ -136,22 +111,4 @@ public class SentenceDetectorMETest {
}
- @Test(expected = InsufficientTrainingDataException.class)
- public void testInsufficientData() throws IOException {
-
- InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
- "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
-
- TrainingParameters mlParams = new TrainingParameters();
- mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
- mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
-
- SentenceDetectorFactory factory = new SentenceDetectorFactory("eng", true, null, null);
-
- SentenceDetectorME.train("eng",
- new SentenceSampleStream(
- new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
-
- }
-
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 87ced1b..93ec184 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -135,6 +135,91 @@ public class SentenceDetectorMETest {
Assert.assertEquals(new Span(16, 56), pos[1]);
}
+
+ /**
+ * Runs the sentence detection checks against a detector created with the language-key
+ * constructor instead of a model trained inside the test.
+ * <p>
+ * NOTE(review): that constructor obtains its model through DownloadUtil, so this unit
+ * test presumably needs network access (or a previously cached model) - confirm that
+ * this is acceptable outside the *IT integration tests.
+ */
+ @Test
+ public void testSentenceDetectorDownloadModel() throws IOException {
+
+ SentenceDetectorME sentDetect = new SentenceDetectorME("en");
+
+ // Tests sentence detector with sentDetect method
+ String sampleSentences1 = "This is a test. There are many tests, this is the second.";
+ String[] sents = sentDetect.sentDetect(sampleSentences1);
+ Assert.assertEquals(sents.length,2);
+ Assert.assertEquals(sents[0],"This is a test.");
+ Assert.assertEquals(sents[1],"There are many tests, this is the second.");
+ double[] probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(probs.length,2);
+
+ String sampleSentences2 = "This is a test. There are many tests, this is the second";
+ sents = sentDetect.sentDetect(sampleSentences2);
+ Assert.assertEquals(sents.length,2);
+ probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(probs.length,2);
+ Assert.assertEquals(sents[0],"This is a test.");
+ Assert.assertEquals(sents[1],"There are many tests, this is the second");
+
+ String sampleSentences3 = "This is a \"test\". He said \"There are many tests, this is the second.\"";
+ sents = sentDetect.sentDetect(sampleSentences3);
+ Assert.assertEquals(sents.length,2);
+ probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(probs.length,2);
+ Assert.assertEquals(sents[0],"This is a \"test\".");
+ Assert.assertEquals(sents[1],"He said \"There are many tests, this is the second.\"");
+
+ String sampleSentences4 = "This is a \"test\". I said \"This is a test.\" Any questions?";
+ sents = sentDetect.sentDetect(sampleSentences4);
+ Assert.assertEquals(sents.length,3);
+ probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(probs.length,3);
+ Assert.assertEquals(sents[0],"This is a \"test\".");
+ Assert.assertEquals(sents[1],"I said \"This is a test.\"");
+ Assert.assertEquals(sents[2],"Any questions?");
+
+ String sampleSentences5 = "This is a one sentence test space at the end. ";
+ sents = sentDetect.sentDetect(sampleSentences5);
+ Assert.assertEquals(1, sentDetect.getSentenceProbabilities().length);
+ Assert.assertEquals(sents[0],"This is a one sentence test space at the end.");
+
+ String sampleSentences6 = "This is a one sentences test with tab at the end. ";
+ sents = sentDetect.sentDetect(sampleSentences6);
+ Assert.assertEquals(sents[0],"This is a one sentences test with tab at the end.");
+
+ String sampleSentences7 = "This is a test. With spaces between the two sentences.";
+ sents = sentDetect.sentDetect(sampleSentences7);
+ Assert.assertEquals(sents[0],"This is a test.");
+ Assert.assertEquals(sents[1],"With spaces between the two sentences.");
+
+ String sampleSentences9 = "";
+ sents = sentDetect.sentDetect(sampleSentences9);
+ Assert.assertEquals(0, sents.length);
+
+ String sampleSentences10 = " "; // whitespaces and tabs
+ sents = sentDetect.sentDetect(sampleSentences10);
+ Assert.assertEquals(0, sents.length);
+
+ String sampleSentences11 = "This is test sentence without a dot at the end and spaces ";
+ sents = sentDetect.sentDetect(sampleSentences11);
+ Assert.assertEquals(sents[0],"This is test sentence without a dot at the end and spaces");
+ probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(1, probs.length);
+
+ String sampleSentence12 = " This is a test.";
+ sents = sentDetect.sentDetect(sampleSentence12);
+ Assert.assertEquals(sents[0],"This is a test.");
+
+ String sampleSentence13 = " This is a test";
+ sents = sentDetect.sentDetect(sampleSentence13);
+ Assert.assertEquals(sents[0],"This is a test");
+
+ // Test that sentPosDetect also works
+ Span[] pos = sentDetect.sentPosDetect(sampleSentences2);
+ Assert.assertEquals(pos.length,2);
+ probs = sentDetect.getSentenceProbabilities();
+ Assert.assertEquals(probs.length,2);
+ Assert.assertEquals(new Span(0, 15), pos[0]);
+ Assert.assertEquals(new Span(16, 56), pos[1]);
+
+ }
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java
new file mode 100644
index 0000000..461c7b5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Integration test for the {@link TokenizerME} constructor that takes a language key.
+ */
+public class TokenizerMEIT {
+
+  /**
+   * Tokenizes a word followed by a comma with the default "en" model and expects the
+   * comma to become a token of its own.
+   */
+  @Test
+  public void testTokenizerDownloadedModel() throws IOException {
+
+    final String[] expectedTokens = {"test", ","};
+
+    final TokenizerME tokenizer = new TokenizerME("en");
+    final String[] tokens = tokenizer.tokenize("test,");
+
+    // Covers both the length check and the element-by-element comparison.
+    Assert.assertArrayEquals(expectedTokens, tokens);
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index a634b07..e541f3d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -56,6 +56,18 @@ public class TokenizerMETest {
}
@Test
+ public void testTokenizerDownloadedModel() throws IOException {
+
+ // The language-key constructor obtains its model through DownloadUtil.
+ // NOTE(review): this unit test therefore presumably needs network access (or a
+ // previously cached model) - confirm that this is acceptable outside the *IT tests.
+ TokenizerME tokenizer = new TokenizerME("en");
+
+ String[] tokens = tokenizer.tokenize("test,");
+
+ // The trailing comma must be split off as its own token.
+ Assert.assertEquals(2, tokens.length);
+ Assert.assertEquals("test", tokens[0]);
+ Assert.assertEquals(",", tokens[1]);
+ }
+
+ @Test
public void testTokenizer() throws IOException {
TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
diff --git a/pom.xml b/pom.xml
index 3483615..f5208a6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,6 +150,7 @@
<coveralls.maven.plugin>4.3.0</coveralls.maven.plugin>
<jacoco.maven.plugin>0.7.9</jacoco.maven.plugin>
<maven.surefire.plugin>2.22.2</maven.surefire.plugin>
+ <maven.failsafe.plugin>2.22.2</maven.failsafe.plugin>
<mockito.version>3.9.0</mockito.version>
</properties>
@@ -262,11 +263,35 @@
<excludes>
<exclude>**/stemmer/*</exclude>
<exclude>**/stemmer/snowball/*</exclude>
+ <exclude>**/*IT.java</exclude>
</excludes>
</configuration>
</plugin>
<plugin>
+ <!-- Runs the *IT.java integration tests in the integration-test and verify goals. -->
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>${maven.failsafe.plugin}</version>
+ <executions>
+ <execution>
+ <id>integration-test</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <!-- Unit tests (*Test.java) are excluded here; only *IT.java classes are included. -->
+ <excludes>
+ <exclude>**/*Test.java</exclude>
+ </excludes>
+ <includes>
+ <include>**/*IT.java</include>
+ </includes>
+ </configuration>
+ </plugin>
+
+ <plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>2.7</version>