You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/03/28 13:56:31 UTC

[opennlp] branch master updated: OPENNLP-1318: Add automatic model downloading. (#383)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 70b6147  OPENNLP-1318: Add automatic model downloading. (#383)
70b6147 is described below

commit 70b614773d1a74aecadd7794abfdbf45b36676b7
Author: Jeff Zemerick <je...@mtnfog.com>
AuthorDate: Mon Mar 28 09:56:28 2022 -0400

    OPENNLP-1318: Add automatic model downloading. (#383)
    
    * OPENNLP-1318: Adding automatic model downloads.
---
 .../main/java/opennlp/tools/chunker/ChunkerME.java |  11 ++
 .../cmdline/namefind/TokenNameFinderTool.java      |   2 +-
 .../java/opennlp/tools/namefind/NameFinderME.java  |   4 +-
 .../java/opennlp/tools/postag/POSTaggerME.java     |  10 ++
 .../tools/sentdetect/SentenceDetectorME.java       |  11 ++
 .../java/opennlp/tools/tokenize/TokenizerME.java   |  11 ++
 .../main/java/opennlp/tools/util/DownloadUtil.java | 172 +++++++++++++++++++++
 .../java/opennlp/tools/util/model/BaseModel.java   |   1 -
 .../java/opennlp/tools/chunker/ChunkerMEIT.java    |  48 ++++++
 .../java/opennlp/tools/chunker/ChunkerMETest.java  |  10 ++
 .../tools/cmdline/TokenNameFinderToolTest.java     |   2 +-
 .../java/opennlp/tools/postag/POSTaggerMEIT.java   |  49 ++++++
 ...tectorMETest.java => SentenceDetectorMEIT.java} |  49 +-----
 .../tools/sentdetect/SentenceDetectorMETest.java   |  85 ++++++++++
 .../java/opennlp/tools/tokenize/TokenizerMEIT.java |  39 +++++
 .../opennlp/tools/tokenize/TokenizerMETest.java    |  12 ++
 pom.xml                                            |  25 +++
 17 files changed, 490 insertions(+), 51 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
index 4346df3..507b349 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -30,6 +30,7 @@ import opennlp.tools.ml.TrainerFactory.TrainerType;
 import opennlp.tools.ml.model.Event;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.SequenceClassificationModel;
+import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
@@ -108,6 +109,16 @@ public class ChunkerME implements Chunker {
   }
 
   /**
+   * Initializes the chunker by downloading a default model.
+   * @param language The language of the model.
+   * @throws IOException Thrown if the model cannot be downloaded or saved.
+   */
+  public ChunkerME(String language) throws IOException {
+    this((ChunkerModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.CHUNKER,
+            ChunkerModel.class));
+  }
+
+  /**
    * Initializes the current instance with the specified model.
    * The default beam size is used.
    *
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
index 551215f..2fa9970 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
@@ -69,7 +69,7 @@ public final class TokenNameFinderTool extends BasicCmdLineTool {
 
       try {
         untokenizedLineStream = new PlainTextByLineStream(
-            new SystemInputStreamFactory(), SystemInputStreamFactory.encoding());
+                new SystemInputStreamFactory(), SystemInputStreamFactory.encoding());
         String line;
         while ((line = untokenizedLineStream.read()) != null) {
           String[] whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
index 12ce701..7a3b98a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
@@ -218,8 +218,8 @@ public class NameFinderME implements TokenNameFinder {
   }
 
   public static TokenNameFinderModel train(String languageCode, String type,
-          ObjectStream<NameSample> samples, TrainingParameters trainParams,
-          TokenNameFinderFactory factory) throws IOException {
+                                           ObjectStream<NameSample> samples, TrainingParameters trainParams,
+                                           TokenNameFinderFactory factory) throws IOException {
 
     trainParams.putIfAbsent(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
     trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, 0);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
index 1edcf4b..95ffffe 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
@@ -36,6 +36,7 @@ import opennlp.tools.ml.model.Event;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.SequenceClassificationModel;
 import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
@@ -87,6 +88,15 @@ public class POSTaggerME implements POSTagger {
   private SequenceValidator<String> sequenceValidator;
 
   /**
+   * Initializes the POS tagger by downloading a default model.
+   * @param language The language of the POS tagger
+   * @throws IOException Thrown if the model cannot be downloaded or saved.
+   */
+  public POSTaggerME(String language) throws IOException {
+    this((POSModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.POS, POSModel.class));
+  }
+
+  /**
    * Initializes the current instance with the provided model.
    *
    * @param model
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index b5ad804..b15e08d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.ml.model.Event;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.sentdetect.lang.Factory;
+import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.StringUtil;
@@ -79,6 +80,16 @@ public class SentenceDetectorME implements SentenceDetector {
   protected boolean useTokenEnd;
 
   /**
+   * Initializes the sentence detector by downloading a default model.
+   * @param language The language of the sentence detector.
+   * @throws IOException Thrown if the model cannot be downloaded or saved.
+   */
+  public SentenceDetectorME(String language) throws IOException {
+    this((SentenceModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.SENTENCE_DETECTOR,
+            SentenceModel.class));
+  }
+
+  /**
    * Initializes the current instance.
    *
    * @param model the {@link SentenceModel}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 6d54308..9ecdf13 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -32,6 +32,7 @@ import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.ml.model.Event;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.tokenize.lang.Factory;
+import opennlp.tools.util.DownloadUtil;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
@@ -118,6 +119,16 @@ public class TokenizerME extends AbstractTokenizer {
 
   private List<Span> newTokens;
 
+  /**
+   * Initializes the tokenizer by downloading a default model.
+   * @param language The language of the tokenizer.
+   * @throws IOException Thrown if the model cannot be downloaded or saved.
+   */
+  public TokenizerME(String language) throws IOException {
+    this((TokenizerModel) DownloadUtil.downloadModel(language, DownloadUtil.ModelType.TOKENIZER,
+            TokenizerModel.class));
+  }
+
   public TokenizerME(TokenizerModel model) {
     TokenizerFactory factory = model.getFactory();
     this.alphanumeric = factory.getAlphaNumericPattern();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
new file mode 100644
index 0000000..1969789
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * This class facilitates the downloading of pretrained OpenNLP models.
+ */
+public class DownloadUtil {
+
+  /**
+   * The type of model.
+   */
+  public enum ModelType {
+    TOKENIZER("token"),
+    SENTENCE_DETECTOR("sent"),
+    POS("pos-perceptron"),
+    NAME_FINDER("ner"),
+    CHUNKER("chunker"),
+    PARSER("parser-chunking");
+
+    private String name;
+
+    ModelType(String name) {
+      this.name = name;
+    }
+  }
+
+  private static final String baseUrl = "https://dlcdn.apache.org/opennlp/";
+
+  public static Map<String, Map<ModelType, String>> available_models = new HashMap<>();
+
+  static {
+    // NOTE(review): the French sentence model file name below looks garbled - confirm against the server.
+    final Map<ModelType, String> frenchModels = new HashMap<>();
+    frenchModels.put(ModelType.SENTENCE_DETECTOR,
+        baseUrl + "models/ud-models-1.0/opennlp-1.0-1.9.3fr-ud-ftb-sentence-1.0-1.9.3.bin");
+    frenchModels.put(ModelType.POS,
+        baseUrl + "models/ud-models-1.0/opennlp-fr-ud-ftb-pos-1.0-1.9.3.bin");
+    frenchModels.put(ModelType.TOKENIZER,
+        baseUrl + "models/ud-models-1.0/opennlp-fr-ud-ftb-tokens-1.0-1.9.3.bin");
+    available_models.put("fr", frenchModels);
+
+    final Map<ModelType, String> germanModels = new HashMap<>();
+    germanModels.put(ModelType.SENTENCE_DETECTOR,
+        baseUrl + "models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin");
+    germanModels.put(ModelType.POS,
+        baseUrl + "models/ud-models-1.0/opennlp-de-ud-gsd-pos-1.0-1.9.3.bin");
+    germanModels.put(ModelType.TOKENIZER,
+        baseUrl + "models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin");
+    available_models.put("de", germanModels);
+
+    final Map<ModelType, String> englishModels = new HashMap<>();
+    englishModels.put(ModelType.SENTENCE_DETECTOR,
+        baseUrl + "models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin");
+    englishModels.put(ModelType.POS,
+        baseUrl + "models/ud-models-1.0/opennlp-en-ud-ewt-pos-1.0-1.9.3.bin");
+    englishModels.put(ModelType.TOKENIZER,
+        baseUrl + "models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin");
+    available_models.put("en", englishModels);
+
+    final Map<ModelType, String> italianModels = new HashMap<>();
+    italianModels.put(ModelType.SENTENCE_DETECTOR,
+        baseUrl + "models/ud-models-1.0/opennlp-it-ud-vit-sentence-1.0-1.9.3.bin");
+    italianModels.put(ModelType.POS,
+        baseUrl + "models/ud-models-1.0/opennlp-it-ud-vit-pos-1.0-1.9.3.bin");
+    italianModels.put(ModelType.TOKENIZER,
+        baseUrl + "models/ud-models-1.0/opennlp-it-ud-vit-tokens-1.0-1.9.3.bin");
+    available_models.put("it", italianModels);
+
+    final Map<ModelType, String> dutchModels = new HashMap<>();
+    dutchModels.put(ModelType.SENTENCE_DETECTOR,
+        baseUrl + "models/ud-models-1.0/opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
+    dutchModels.put(ModelType.POS,
+        baseUrl + "models/ud-models-1.0/opennlp-nl-ud-alpino-pos-1.0-1.9.3.bin");
+    dutchModels.put(ModelType.TOKENIZER,
+        baseUrl + "models/ud-models-1.0/opennlp-nl-ud-alpino-tokens-1.0-1.9.3.bin");
+    available_models.put("nl", dutchModels);
+
+  }
+
+  /**
+   * Loads the default model for a language and model type; throws IOException if none is registered.
+   */
+  public static BaseModel downloadModel(String language, ModelType modelType, Class type)
+          throws IOException {
+    final Map<ModelType, String> models = available_models.get(language);
+    final String url = models != null ? models.get(modelType) : null;
+    if (url == null) {
+      throw new IOException("Invalid model.");
+    }
+    return downloadModel(new URL(url), type);
+  }
+
+  /**
+   * Downloads a model from a URL and loads it. The model file is cached in the
+   * .opennlp/ directory under the user's home directory, which is created if it
+   * does not already exist. A model file already present there is not downloaded
+   * again. The download is written to a temporary file first so that a failed
+   * transfer does not leave a truncated model in the cache.
+   *
+   * @param url The model's URL.
+   * @param type The model class to load: {@link TokenizerModel}, {@link ChunkerModel},
+   *     {@link SentenceModel} or {@link POSModel}; any other value loads a
+   *     {@link TokenNameFinderModel}.
+   * @return The model loaded from the cached file.
+   * @throws IOException Thrown if the model cannot be downloaded or saved.
+   */
+  public static BaseModel downloadModel(URL url, Class type) throws IOException {
+    final Path homeDirectory = Paths.get(System.getProperty("user.home") + "/.opennlp/");
+    Files.createDirectories(homeDirectory);
+
+    final String filename = url.toString().substring(url.toString().lastIndexOf("/") + 1);
+    final Path localFile = homeDirectory.resolve(filename);
+
+    if (!Files.exists(localFile)) {
+      System.out.println("Downloading model " + url + " to " + localFile);
+      final Path partFile = Files.createTempFile(homeDirectory, filename, ".part");
+      try (final InputStream in = url.openStream()) {
+        Files.copy(in, partFile, StandardCopyOption.REPLACE_EXISTING);
+        Files.move(partFile, localFile, StandardCopyOption.REPLACE_EXISTING);
+      } finally {
+        Files.deleteIfExists(partFile);
+      }
+      System.out.println("Download complete.");
+    }
+
+    if (type == TokenizerModel.class) {
+      return new TokenizerModel(localFile);
+    } else if (type == ChunkerModel.class) {
+      return new ChunkerModel(localFile);
+    } else if (type == SentenceModel.class) {
+      return new SentenceModel(localFile);
+    } else if (type == POSModel.class) {
+      return new POSModel(localFile);
+    } else {
+      return new TokenNameFinderModel(localFile);
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
index a6a9a22..8c3d716 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -502,7 +502,6 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
    */
   public final String getManifestProperty(String key) {
     Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
-
     return manifest.getProperty(key);
   }
 
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java
new file mode 100644
index 0000000..6d7f410
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMEIT.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.chunker;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class ChunkerMEIT {
+
+  private static String[] toks1 = { "Rockwell", "said", "the", "agreement", "calls", "for",
+      "it", "to", "supply", "200", "additional", "so-called", "shipsets",
+      "for", "the", "planes", "." };
+
+  private static String[] tags1 = { "NNP", "VBD", "DT", "NN", "VBZ", "IN", "PRP", "TO", "VB",
+      "CD", "JJ", "JJ", "NNS", "IN", "DT", "NNS", "." };
+
+  private static String[] expect1 = { "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "B-SBAR",
+      "B-NP", "B-VP", "I-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-PP", "B-NP",
+      "I-NP", "O" };
+
+  @Test
+  public void downloadModel() throws IOException {
+
+    ChunkerME chunker = new ChunkerME("en");
+
+    String[] preds = chunker.chunk(toks1, tags1);
+
+    Assert.assertArrayEquals(expect1, preds);
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index b7654fb..cfbd815 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -64,6 +64,16 @@ public class ChunkerMETest {
       "B-NP", "B-VP", "I-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-PP", "B-NP",
       "I-NP", "O" };
 
+  @Test(expected = IOException.class)
+  public void downloadNonExistentModel() throws IOException {
+
+    ChunkerME chunker = new ChunkerME("en");
+
+    String[] preds = chunker.chunk(toks1, tags1);
+
+    Assert.assertArrayEquals(expect1, preds);
+  }
+
   @Before
   public void startup() throws IOException {
     // train the chunker
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
index e76814a..830f3d2 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -67,7 +67,7 @@ public class TokenNameFinderToolTest {
 
     model1.delete();
   }
-  
+
   @Test(expected = TerminateToolException.class)
   public void invalidModel() {
 
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
new file mode 100644
index 0000000..edb37f4
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.postag;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class POSTaggerMEIT {
+
+  @Test
+  public void testPOSTagger() throws IOException {
+
+    POSTagger tagger = new POSTaggerME("en");
+
+    String[] tags = tagger.tag(new String[] {
+        "The",
+        "driver",
+        "got",
+        "badly",
+        "injured",
+        "."});
+
+    Assert.assertEquals(6, tags.length);
+    Assert.assertEquals("DT", tags[0]);
+    Assert.assertEquals("NN", tags[1]);
+    Assert.assertEquals("VBD", tags[2]);
+    Assert.assertEquals("RB", tags[3]);
+    Assert.assertEquals("VBN", tags[4]);
+    Assert.assertEquals(".", tags[5]);
+  }
+  
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
similarity index 72%
copy from opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
copy to opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
index 87ced1b..2ef425a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEIT.java
@@ -15,46 +15,21 @@
  * limitations under the License.
  */
 
-
 package opennlp.tools.sentdetect;
 
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
 
-import opennlp.tools.formats.ResourceAsStreamFactory;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.InsufficientTrainingDataException;
-import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
 
-/**
- * Tests for the {@link SentenceDetectorME} class.
- */
-public class SentenceDetectorMETest {
+public class SentenceDetectorMEIT {
 
   @Test
-  public void testSentenceDetector() throws IOException {
-
-    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
-        "/opennlp/tools/sentdetect/Sentences.txt");
-
-    TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
-
-    SentenceDetectorFactory factory = new SentenceDetectorFactory("eng", true, null, null);
+  public void testSentenceDetectorDownloadModel() throws IOException {
 
-    SentenceModel sentdetectModel = SentenceDetectorME.train(
-        "eng", new SentenceSampleStream(new PlainTextByLineStream(in,
-            StandardCharsets.UTF_8)), factory, mlParams);
-
-    Assert.assertEquals("eng", sentdetectModel.getLanguage());
-
-    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+    SentenceDetectorME sentDetect = new SentenceDetectorME("en");
 
     // Tests sentence detector with sentDetect method
     String sampleSentences1 = "This is a test. There are many tests, this is the second.";
@@ -136,22 +111,4 @@ public class SentenceDetectorMETest {
 
   }
   
-  @Test(expected = InsufficientTrainingDataException.class)
-  public void testInsufficientData() throws IOException {
-
-    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
-        "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
-
-    TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
-
-    SentenceDetectorFactory factory = new SentenceDetectorFactory("eng", true, null, null);
-    
-    SentenceDetectorME.train("eng",
-        new SentenceSampleStream(
-            new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
-    
-  }
-  
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 87ced1b..93ec184 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -135,6 +135,91 @@ public class SentenceDetectorMETest {
     Assert.assertEquals(new Span(16, 56), pos[1]);
 
   }
+
+  @Test
+  public void testSentenceDetectorDownloadModel() throws IOException {
+
+    SentenceDetectorME sentDetect = new SentenceDetectorME("en");
+
+    // Tests sentence detector with sentDetect method
+    String sampleSentences1 = "This is a test. There are many tests, this is the second.";
+    String[] sents = sentDetect.sentDetect(sampleSentences1);
+    Assert.assertEquals(sents.length,2);
+    Assert.assertEquals(sents[0],"This is a test.");
+    Assert.assertEquals(sents[1],"There are many tests, this is the second.");
+    double[] probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(probs.length,2);
+
+    String sampleSentences2 = "This is a test. There are many tests, this is the second";
+    sents = sentDetect.sentDetect(sampleSentences2);
+    Assert.assertEquals(sents.length,2);
+    probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(probs.length,2);
+    Assert.assertEquals(sents[0],"This is a test.");
+    Assert.assertEquals(sents[1],"There are many tests, this is the second");
+
+    String sampleSentences3 = "This is a \"test\". He said \"There are many tests, this is the second.\"";
+    sents = sentDetect.sentDetect(sampleSentences3);
+    Assert.assertEquals(sents.length,2);
+    probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(probs.length,2);
+    Assert.assertEquals(sents[0],"This is a \"test\".");
+    Assert.assertEquals(sents[1],"He said \"There are many tests, this is the second.\"");
+
+    String sampleSentences4 = "This is a \"test\". I said \"This is a test.\"  Any questions?";
+    sents = sentDetect.sentDetect(sampleSentences4);
+    Assert.assertEquals(sents.length,3);
+    probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(probs.length,3);
+    Assert.assertEquals(sents[0],"This is a \"test\".");
+    Assert.assertEquals(sents[1],"I said \"This is a test.\"");
+    Assert.assertEquals(sents[2],"Any questions?");
+
+    String sampleSentences5 = "This is a one sentence test space at the end.    ";
+    sents = sentDetect.sentDetect(sampleSentences5);
+    Assert.assertEquals(1, sentDetect.getSentenceProbabilities().length);
+    Assert.assertEquals(sents[0],"This is a one sentence test space at the end.");
+
+    String sampleSentences6 = "This is a one sentences test with tab at the end.            ";
+    sents = sentDetect.sentDetect(sampleSentences6);
+    Assert.assertEquals(sents[0],"This is a one sentences test with tab at the end.");
+
+    String sampleSentences7 = "This is a test.    With spaces between the two sentences.";
+    sents = sentDetect.sentDetect(sampleSentences7);
+    Assert.assertEquals(sents[0],"This is a test.");
+    Assert.assertEquals(sents[1],"With spaces between the two sentences.");
+
+    String sampleSentences9 = "";
+    sents = sentDetect.sentDetect(sampleSentences9);
+    Assert.assertEquals(0, sents.length);
+
+    String sampleSentences10 = "               "; // whitespaces and tabs
+    sents = sentDetect.sentDetect(sampleSentences10);
+    Assert.assertEquals(0, sents.length);
+
+    String sampleSentences11 = "This is test sentence without a dot at the end and spaces          ";
+    sents = sentDetect.sentDetect(sampleSentences11);
+    Assert.assertEquals(sents[0],"This is test sentence without a dot at the end and spaces");
+    probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(1, probs.length);
+
+    String sampleSentence12 = "    This is a test.";
+    sents = sentDetect.sentDetect(sampleSentence12);
+    Assert.assertEquals(sents[0],"This is a test.");
+
+    String sampleSentence13 = " This is a test";
+    sents = sentDetect.sentDetect(sampleSentence13);
+    Assert.assertEquals(sents[0],"This is a test");
+
+    // Test that sentPosDetect also works
+    Span[] pos = sentDetect.sentPosDetect(sampleSentences2);
+    Assert.assertEquals(pos.length,2);
+    probs = sentDetect.getSentenceProbabilities();
+    Assert.assertEquals(probs.length,2);
+    Assert.assertEquals(new Span(0, 15), pos[0]);
+    Assert.assertEquals(new Span(16, 56), pos[1]);
+
+  }
   
   @Test(expected = InsufficientTrainingDataException.class)
   public void testInsufficientData() throws IOException {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java
new file mode 100644
index 0000000..461c7b5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMEIT.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TokenizerMEIT {
+
+  @Test
+  public void testTokenizerDownloadedModel() throws IOException {
+
+    TokenizerME tokenizer = new TokenizerME("en");
+
+    String[] tokens = tokenizer.tokenize("test,");
+
+    Assert.assertEquals(2, tokens.length);
+    Assert.assertEquals("test", tokens[0]);
+    Assert.assertEquals(",", tokens[1]);
+  }
+  
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index a634b07..e541f3d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -56,6 +56,18 @@ public class TokenizerMETest {
   }
 
   @Test
+  public void testTokenizerDownloadedModel() throws IOException {
+
+    TokenizerME tokenizer = new TokenizerME("en");
+
+    String[] tokens = tokenizer.tokenize("test,");
+
+    Assert.assertEquals(2, tokens.length);
+    Assert.assertEquals("test", tokens[0]);
+    Assert.assertEquals(",", tokens[1]);
+  }
+
+  @Test
   public void testTokenizer() throws IOException {
     TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
 
diff --git a/pom.xml b/pom.xml
index 3483615..f5208a6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -150,6 +150,7 @@
 		<coveralls.maven.plugin>4.3.0</coveralls.maven.plugin>
 		<jacoco.maven.plugin>0.7.9</jacoco.maven.plugin>
 		<maven.surefire.plugin>2.22.2</maven.surefire.plugin>
+		<maven.failsafe.plugin>2.22.2</maven.failsafe.plugin>
 		<mockito.version>3.9.0</mockito.version>
 	</properties>
 
@@ -262,11 +263,35 @@
 						<excludes>
 							<exclude>**/stemmer/*</exclude>
 							<exclude>**/stemmer/snowball/*</exclude>
+							<exclude>**/*IT.java</exclude>
 						</excludes>
 					</configuration>
 				</plugin>
 
 				<plugin>
+					<groupId>org.apache.maven.plugins</groupId>
+					<artifactId>maven-failsafe-plugin</artifactId>
+					<version>${maven.failsafe.plugin}</version>
+					<executions>
+						<execution>
+							<id>integration-test</id>
+							<goals>
+								<goal>integration-test</goal>
+								<goal>verify</goal>
+							</goals>
+						</execution>
+					</executions>
+					<configuration>
+						<excludes>
+							<exclude>**/*Test.java</exclude>
+						</excludes>
+						<includes>
+							<include>**/*IT.java</include>
+						</includes>
+					</configuration>
+				</plugin>
+
+				<plugin>
 					<groupId>de.thetaphi</groupId>
 					<artifactId>forbiddenapis</artifactId>
 					<version>2.7</version>