You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/12/30 12:00:18 UTC
[opennlp] branch main updated: OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 634c9e13 OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)
634c9e13 is described below
commit 634c9e13c64cc9e7da9f3f12e36f8b471346bedb
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Fri Dec 30 13:00:13 2022 +0100
OPENNLP-1414 Investigate why DownloadUtil can't retrieve NL models via CDN (#472)
- provides new JUnit test `DownloadUtilTest` to verify `DownloadUtil`
- provides `EnabledWhenCDNAvailable` annotation for DownloadUtilTest so that tests which actually download models are executed only when the Apache CDN is available
- fixes an incorrect Dutch model URL constant (incorrect path element)
- fixes an incorrect Italian model URL constant ("sentence" -> "tokens")
- fixes an incorrect French model URL constant ("en-ud-ewt" -> "fr-ud-ftb")
- addresses review ideas by rzo1
---
.../main/java/opennlp/tools/util/DownloadUtil.java | 6 +-
.../java/opennlp/tools/util/DownloadUtilTest.java | 166 +++++++++++++++++++++
2 files changed, 169 insertions(+), 3 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
index 91f37d22..cbc36108 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java
@@ -65,7 +65,7 @@ public class DownloadUtil {
frenchModels.put(ModelType.POS,
BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-fr-ud-ftb-pos-1.0-1.9.3.bin");
frenchModels.put(ModelType.TOKENIZER,
- BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin");
+ BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-fr-ud-ftb-tokens-1.0-1.9.3.bin");
available_models.put("fr", frenchModels);
final Map<ModelType, String> germanModels = new HashMap<>();
@@ -92,12 +92,12 @@ public class DownloadUtil {
italianModels.put(ModelType.POS,
BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-pos-1.0-1.9.3.bin");
italianModels.put(ModelType.TOKENIZER,
- BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-sentence-1.0-1.9.3.bin");
+ BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-it-ud-vit-tokens-1.0-1.9.3.bin");
available_models.put("it", italianModels);
final Map<ModelType, String> dutchModels = new HashMap<>();
dutchModels.put(ModelType.SENTENCE_DETECTOR,
- BASE_URL + "models/opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
+ BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-nl-ud-alpino-sentence-1.0-1.9.3.bin");
dutchModels.put(ModelType.POS,
BASE_URL + MODELS_UD_MODELS_1_0 + "opennlp-nl-ud-alpino-pos-1.0-1.9.3.bin");
dutchModels.put(ModelType.TOKENIZER,
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
new file mode 100644
index 00000000..14881815
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.nio.file.DirectoryStream;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.extension.ConditionEvaluationResult;
+import org.junit.jupiter.api.extension.ExecutionCondition;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.extension.ExtensionContext;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.NullAndEmptySource;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+
+import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.platform.commons.util.AnnotationUtils.findAnnotation;
+
+public class DownloadUtilTest {
+
+ private static final String APACHE_CDN = "dlcdn.apache.org";
+
+ private static final int TIMEOUT_MS = 2000;
+
+ @BeforeAll
+ public static void cleanupWhenOnline() {
+ boolean isOnline;
+ try (Socket socket = new Socket()) {
+ socket.connect(new InetSocketAddress(APACHE_CDN, 80), TIMEOUT_MS);
+ isOnline = true;
+ } catch (IOException e) {
+ // Unreachable, unresolvable or timeout
+ isOnline = false;
+ }
+ // If CDN is available -> go cleanup in preparation of the actual tests
+ if (isOnline) {
+ wipeExistingModelFiles("-tokens-");
+ wipeExistingModelFiles("-sentence-");
+ }
+ }
+
+ /*
+ * Helper method that wipes out model files if they exist on the test execution env.
+ * Those model files are wiped from the user's home hidden '.opennlp' subdirectory.
+ *
+ * Thereby, a clean download can be guaranteed - if the CDN is available and tests are executed.
+ */
+ private static void wipeExistingModelFiles(final String fragment) {
+ final String openNLPHomeDir = System.getProperty("user.home") + "/.opennlp/";
+ final Path dir = FileSystems.getDefault().getPath(openNLPHomeDir);
+ if (Files.exists(dir)) {
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*opennlp-*" + fragment + "*")) {
+ for (Path modelFileToWipe: stream) {
+ Files.deleteIfExists(modelFileToWipe);
+ }
+ } catch (IOException e) {
+ fail(e.getLocalizedMessage());
+ }
+ }
+ }
+
+ @ParameterizedTest(name = "Verify \"{0}\" sentence model")
+ @ValueSource(strings = {"en", "fr", "de", "it", "nl"})
+ @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+ public void testDownloadModelByLanguage(String lang) throws IOException {
+ SentenceModel model = DownloadUtil.downloadModel(lang,
+ DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+ Assertions.assertNotNull(model);
+ Assertions.assertEquals(lang, model.getLanguage());
+ Assertions.assertTrue(model.isLoadedFromSerialized());
+ }
+
+ @ParameterizedTest(name = "Verify \"{0}\" tokenizer model")
+ @MethodSource(value = "provideURLs")
+ @EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
+ public void testDownloadModelByURL(String language, URL url) throws IOException {
+ TokenizerModel model = DownloadUtil.downloadModel(url, TokenizerModel.class);
+ Assertions.assertNotNull(model);
+ Assertions.assertEquals(language, model.getLanguage());
+ Assertions.assertTrue(model.isLoadedFromSerialized());
+ }
+
+ @ParameterizedTest(name = "Detect invalid input: \"{0}\"")
+ @NullAndEmptySource
+ @ValueSource(strings = {" ", "\t", "\n"})
+ public void testDownloadModelInvalid(String input) {
+ Assertions.assertThrows(IOException.class, () -> DownloadUtil.downloadModel(
+ input, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class),
+ "Invalid model");
+ }
+
+ private static final DownloadUtil.ModelType MT_TOKENIZER = DownloadUtil.ModelType.TOKENIZER;
+
+ // Note: This needs to be public as JUnit 5 requires it like this.
+ public static Stream<Arguments> provideURLs() {
+ return Stream.of(
+ Arguments.of("en", DownloadUtil.available_models.get("en").get(MT_TOKENIZER)),
+ Arguments.of("fr", DownloadUtil.available_models.get("fr").get(MT_TOKENIZER)),
+ Arguments.of("de", DownloadUtil.available_models.get("de").get(MT_TOKENIZER)),
+ Arguments.of("it", DownloadUtil.available_models.get("it").get(MT_TOKENIZER)),
+ Arguments.of("nl", DownloadUtil.available_models.get("nl").get(MT_TOKENIZER))
+ );
+ }
+
+ // JUnit5 execution condition to decide whether tests can assume CDN downloads are possible (= online).
+ private static class CDNAvailableCondition implements ExecutionCondition {
+
+ @Override
+ public ConditionEvaluationResult evaluateExecutionCondition(ExtensionContext context) {
+ final var optional = findAnnotation(context.getElement(), EnabledWhenCDNAvailable.class);
+ if (optional.isPresent()) {
+ final EnabledWhenCDNAvailable annotation = optional.get();
+ final String host = annotation.hostname();
+ try (Socket socket = new Socket()) {
+ socket.connect(new InetSocketAddress(host, 80), TIMEOUT_MS);
+ return ConditionEvaluationResult.enabled("CDN is reachable.");
+ } catch (IOException e) {
+ // Unreachable, unresolvable or timeout
+ return ConditionEvaluationResult.disabled("CDN is unreachable.");
+ }
+ }
+ return ConditionEvaluationResult.enabled("Nothing annotated with DisabledWhenOffline.");
+ }
+ }
+
+ // Custom JUnit5 conditional annotation: enables a test only if the given CDN host is reachable
+ @Retention(RetentionPolicy.RUNTIME)
+ @ExtendWith(CDNAvailableCondition.class)
+ @ParameterizedTest
+ public @interface EnabledWhenCDNAvailable {
+
+ String hostname();
+
+ }
+}