You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/12/30 12:00:18 UTC

[opennlp] branch main updated: OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new 634c9e13 OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)
634c9e13 is described below

commit 634c9e13c64cc9e7da9f3f12e36f8b471346bedb
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Fri Dec 30 13:00:13 2022 +0100

    OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)
    
    - provides new JUnit test `DownloadUtilTest` to verify `DownloadUtil`
    - provides `EnabledWhenCDNAvailable` annotation for DownloadUtilTest so that tests which actually download models run only when the Apache CDN is available
    - fixes an incorrect Dutch model URL constant (incorrect path element)
    - fixes an incorrect Italian model URL constant ("sentence" -> "tokens")
    - fixes an incorrect French model URL constant ("en-ud-ewt" -> "fr-ud-ftb")
    - addresses review ideas by rzo1
---
 .../main/java/opennlp/tools/util/DownloadUtil.java |   6 +-
 .../java/opennlp/tools/util/DownloadUtilTest.java  | 166 +++++++++++++++++++++
 2 files changed, 169 insertions(+), 3 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
index 91f37d22..cbc36108 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -65,7 +65,7 @@ public class DownloadUtil {
     frenchModels.put(ModelType.POS,
         BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-fr-ud-ftb-pos-1.0-1.9.3.bin");
     frenchModels.put(ModelType.TOKENIZER,
-        BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin");
+        BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-fr-ud-ftb-tokens-1.0-1.9.3.bin");
     available_models.put("fr", frenchModels);
 
     final Map<ModelType, String> germanModels = new HashMap<>();
@@ -92,12 +92,12 @@ public class DownloadUtil {
     italianModels.put(ModelType.POS,
         BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-pos-1.0-1.9.3.bin");
     italianModels.put(ModelType.TOKENIZER,
-        BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-sentence-1.0-1.9.3.bin");
+        BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-tokens-1.0-1.9.3.bin");
     available_models.put("it", italianModels);
 
     final Map<ModelType, String> dutchModels = new HashMap<>();
     dutchModels.put(ModelType.SENTENCE_DETECTOR,
-        BASE_URL + "models/opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
+        BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
     dutchModels.put(ModelType.POS,
         BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-nl-ud-alpino-pos-1.0-1.9.3.bin");
     dutchModels.put(ModelType.TOKENIZER,
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
new file mode 100644
index 00000000..14881815
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.nio.file.DirectoryStream;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.extension.ConditionEvaluationResult;
+import org.junit.jupiter.api.extension.ExecutionCondition;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.extension.ExtensionContext;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+
+import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.platform.commons.util.AnnotationUtils.findAnnotation;
+
+public class DownloadUtilTest {
+
+  private static final String APACHE_CDN = "dlcdn.apache.org";
+
+  private static final int TIMEOUT_MS = 2000;
+
+  @BeforeAll
+  public static void cleanupWhenOnline() {
+    boolean isOnline;
+    try (Socket socket = new Socket()) {
+      socket.connect(new InetSocketAddress(APACHE_CDN, 80), TIMEOUT_MS);
+      isOnline = true;
+    } catch (IOException e) {
+      // Unreachable, unresolvable or timeout
+      isOnline = false;
+    }
+    // If CDN is available -> go cleanup in preparation of the actual tests
+    if (isOnline) {
+      wipeExistingModelFiles("-tokens-");
+      wipeExistingModelFiles("-sentence-");
+    }
+  }
+
+  /*
+   * Helper method that wipes out model files if they exist in the test execution environment.
+   * Those model files are wiped from the hidden '.opennlp' subdirectory of the user's home.
+   *
+   * Thereby, a clean download can be guaranteed - if the CDN is available and tests are executed.
+   */
+  private static void wipeExistingModelFiles(final String fragment) {
+    final String openNLPHomeDir = System.getProperty("user.home") + "/.opennlp/";
+    final Path dir = FileSystems.getDefault().getPath(openNLPHomeDir);
+    if (Files.exists(dir)) {
+      try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*opennlp-*" + fragment + "*")) {
+        for (Path modelFileToWipe: stream) {
+          Files.deleteIfExists(modelFileToWipe);
+        }
+      } catch (IOException e) {
+        fail(e.getLocalizedMessage());
+      }
+    }
+  }
+
+  @ParameterizedTest(name = "Verify \"{0}\" sentence model")
+  @ValueSource(strings = {"en", "fr", "de", "it", "nl"})
+  @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+  public void testDownloadModelByLanguage(String lang) throws IOException {
+    SentenceModel model = DownloadUtil.downloadModel(lang,
+            DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+    Assertions.assertNotNull(model);
+    Assertions.assertEquals(lang, model.getLanguage());
+    Assertions.assertTrue(model.isLoadedFromSerialized());
+  }
+
+  @ParameterizedTest(name = "Verify \"{0}\" tokenizer model")
+  @MethodSource(value = "provideURLs")
+  @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+  public void testDownloadModelByURL(String language, URL url) throws IOException {
+    TokenizerModel model = DownloadUtil.downloadModel(url, TokenizerModel.class);
+    Assertions.assertNotNull(model);
+    Assertions.assertEquals(language, model.getLanguage());
+    Assertions.assertTrue(model.isLoadedFromSerialized());
+  }
+
+  @ParameterizedTest(name = "Detect invalid input: \"{0}\"")
+  @NullAndEmptySource
+  @ValueSource(strings = {" ", "\t", "\n"})
+  public void testDownloadModelInvalid(String input) {
+    Assertions.assertThrows(IOException.class, () -> DownloadUtil.downloadModel(
+                    input, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class),
+            "Invalid model");
+  }
+
+  private static final DownloadUtil.ModelType MT_TOKENIZER = DownloadUtil.ModelType.TOKENIZER;
+
+  // Note: This needs to be public as JUnit 5 requires it like this.
+  public static Stream<Arguments> provideURLs() {
+    return Stream.of(
+            Arguments.of("en", DownloadUtil.available_models.get("en").get(MT_TOKENIZER)),
+            Arguments.of("fr", DownloadUtil.available_models.get("fr").get(MT_TOKENIZER)),
+            Arguments.of("de", DownloadUtil.available_models.get("de").get(MT_TOKENIZER)),
+            Arguments.of("it", DownloadUtil.available_models.get("it").get(MT_TOKENIZER)),
+            Arguments.of("nl", DownloadUtil.available_models.get("nl").get(MT_TOKENIZER))
+    );
+  }
+
+  // JUnit5 execution condition to decide whether tests can assume CDN downloads are possible (= online).
+  private static class CDNAvailableCondition implements ExecutionCondition {
+
+    @Override
+    public ConditionEvaluationResult evaluateExecutionCondition(ExtensionContext context) {
+      final var optional = findAnnotation(context.getElement(), EnabledWhenCDNAvailable.class);
+      if (optional.isPresent()) {
+        final EnabledWhenCDNAvailable annotation = optional.get();
+        final String host = annotation.hostname();
+        try (Socket socket = new Socket()) {
+          socket.connect(new InetSocketAddress(host, 80), TIMEOUT_MS);
+          return ConditionEvaluationResult.enabled("CDN is reachable.");
+        } catch (IOException e) {
+          // Unreachable, unresolvable or timeout
+          return ConditionEvaluationResult.disabled("CDN is unreachable.");
+        }
+      }
+      return ConditionEvaluationResult.enabled("Nothing annotated with DisabledWhenOffline.");
+    }
+  }
+
+  // Custom JUnit5 conditional annotation: enables a test only if the given CDN host is reachable
+  @Retention(RetentionPolicy.RUNTIME)
+  @ExtendWith(CDNAvailableCondition.class)
+  @ParameterizedTest
+  public @interface EnabledWhenCDNAvailable {
+
+    String hostname();
+
+  }
+}