You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2022/12/19 08:26:50 UTC
[opennlp] branch master updated: OPENNLP-1412 Provide equals and hashCode for ParserModel and TokenizerModel (#458)

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 04048f56 OPENNLP-1412 Provide equals and hashCode for ParserModel and TokenizerModel (#458)
04048f56 is described below

commit 04048f5607a3e8c82e0aeef1dd67916d2326f43e
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Mon Dec 19 09:26:44 2022 +0100

    OPENNLP-1412 Provide equals and hashCode for ParserModel and TokenizerModel (#458)
    
    - adds specific `equals` and `hashCode` implementations for `TokenizerModel` and  `ParserModel`.
    - introduces new test dependency `junit-jupiter-params` (test-scoped) for opennlp-tools for more flexible test setups.
    - improves `TokenizerModelTest`, `treeinsert.ParserTest` and `chunking.ParserTest` by adding further assertions and fixes TODOs.
    - adds `AbstractParserModelTest` to avoid code duplication, simplifying existing test cases.
    - adds another `Parse` example to demonstrate how easy it is to check different examples without duplicating code.
    - removes uncommented code from both `ParserTest` variants (chunker/treeinsert) by providing actual assertions instead!
---
 opennlp-tools/pom.xml                              |   6 ++
 .../java/opennlp/tools/parser/ParserModel.java     |  23 +++++
 .../opennlp/tools/tokenize/TokenizerModel.java     |  23 +++++
 .../tools/parser/AbstractParserModelTest.java      | 115 +++++++++++++++++++++
 .../opennlp/tools/parser/chunking/ParserTest.java  |  52 ++++------
 .../tools/parser/treeinsert/ParserTest.java        |  51 ++++-----
 .../opennlp/tools/tokenize/TokenizerModelTest.java |  20 ++--
 pom.xml                                            |   7 ++
 8 files changed, 223 insertions(+), 74 deletions(-)

diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index d45ded45..d8e1b5a1 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -62,6 +62,12 @@
       <scope>test</scope>
     </dependency>
 
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-params</artifactId>
+      <scope>test</scope>
+    </dependency>
+
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
index 465457f8..8b892261 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
@@ -367,4 +367,27 @@ public class ParserModel extends BaseModel {
       throw new InvalidFormatException("Missing the head rules!");
     }
   }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(artifactMap.get(MANIFEST_ENTRY),
+            artifactMap.get(PARSER_TAGGER_MODEL_ENTRY_NAME));
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+
+    if (obj instanceof ParserModel) {
+      ParserModel model = (ParserModel) obj;
+      Map<String, Object> artifactMapToCheck = model.artifactMap;
+      AbstractModel abstractModel = (AbstractModel) artifactMapToCheck.get(BUILD_MODEL_ENTRY_NAME);
+
+      return artifactMap.get(MANIFEST_ENTRY).equals(artifactMapToCheck.get(MANIFEST_ENTRY)) &&
+              artifactMap.get(BUILD_MODEL_ENTRY_NAME).equals(abstractModel);
+    }
+    return false;
+  }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index b2d5003f..201107aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -24,6 +24,7 @@ import java.io.InputStream;
 import java.net.URL;
 import java.nio.file.Path;
 import java.util.Map;
+import java.util.Objects;
 
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.ml.model.AbstractModel;
@@ -162,4 +163,26 @@ public final class TokenizerModel extends BaseModel {
   public boolean useAlphaNumericOptimization() {
     return getFactory() != null && getFactory().isUseAlphaNumericOptmization();
   }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(artifactMap.get(MANIFEST_ENTRY), artifactMap.get(TOKENIZER_MODEL_ENTRY));
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+
+    if (obj instanceof TokenizerModel) {
+      TokenizerModel model = (TokenizerModel) obj;
+      Map<String, Object> artifactMapToCheck = model.artifactMap;
+      AbstractModel abstractModel = (AbstractModel) artifactMapToCheck.get(TOKENIZER_MODEL_ENTRY);
+
+      return artifactMap.get(MANIFEST_ENTRY).equals(artifactMapToCheck.get(MANIFEST_ENTRY)) &&
+              artifactMap.get(TOKENIZER_MODEL_ENTRY).equals(abstractModel);
+    }
+    return false;
+  }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
new file mode 100644
index 00000000..9df819a9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import opennlp.tools.util.Span;
+
+/**
+ * Common test class for {@link ParserModel}-driven test cases.
+ */
+public abstract class AbstractParserModelTest {
+
+  /**
+   * @return Retrieves a valid {@link ParserModel}, either trained or loaded.
+   */
+  protected abstract ParserModel getModel();
+
+  /**
+   * Verifies that serialization of {@link ParserModel} equals trained state.
+   * <p>
+   * Tests {@link ParserModel#equals(Object)}.
+   */
+  @Test
+  void testModelSerializationAndEquality() throws IOException {
+    Assertions.assertNotNull(getModel());
+    Assertions.assertFalse(getModel().isLoadedFromSerialized());
+
+    // Test serializing and de-serializing model
+    ByteArrayOutputStream outArray = new ByteArrayOutputStream();
+    getModel().serialize(outArray);
+    outArray.close();
+
+    // TEST: de-serialization and equality
+    ParserModel outputModel = new ParserModel(new ByteArrayInputStream(outArray.toByteArray()));
+    Assertions.assertNotNull(outputModel);
+    Assertions.assertTrue(outputModel.isLoadedFromSerialized());
+    Assertions.assertEquals(getModel(), outputModel);
+  }
+
+  /**
+   * Verifies that parsing with a {@link ParserModel} does not cause problems at runtime.
+   */
+  @ParameterizedTest(name = "Parse example {index}.")
+  @MethodSource("provideParsePairs")
+  void testParsing(String input, String reference) {
+    // prepare
+    Assertions.assertNotNull(getModel());
+    Parse p = Parse.parseParse(input);
+    Assertions.assertNotNull(p);
+    Assertions.assertTrue(p.complete());
+    Assertions.assertEquals(reference, p.getText());
+    opennlp.tools.parser.Parser parser = ParserFactory.create(getModel());
+    Assertions.assertNotNull(parser);
+
+    // TEST: parsing
+    Parse parsedViaParser = parser.parse(p);
+    Assertions.assertNotNull(parsedViaParser);
+    Assertions.assertTrue(parsedViaParser.complete());
+    Assertions.assertEquals(reference, p.getText());
+    Span s = parsedViaParser.getSpan();
+    Assertions.assertNotNull(s);
+  }
+
+  /*
+   * Produces a stream of <parse|text> pairs for parameterized unit tests.
+   */
+  private static Stream<Arguments> provideParsePairs() {
+    return Stream.of(
+            // Example 1: with eos character
+            Arguments.of("(TOP  "
+                        + "(S (S (NP-SBJ (PRP She)  )(VP (VBD was)  "
+                        + "(ADVP (RB just)  )(NP-PRD (NP (DT another)  (NN freighter)  )"
+                        + "(PP (IN from)  (NP (DT the)  (NNPS States)  )))))(, ,)  "
+                        + "(CC and) "
+                        + "(S (NP-SBJ (PRP she)  )(VP (VBD seemed)  "
+                        + "(ADJP-PRD (ADJP (RB as)  (JJ commonplace)  )(PP (IN as)  (NP (PRP$ her)  "
+                        + "(NN name)  )))))(. .)  ))",
+                        "She was just another freighter from the States , " +
+                        "and she seemed as commonplace as her name . "),
+            // Example 2: without eos character
+            Arguments.of("(S  "
+                        + "(PP (IN On) (NP (NNP June) (CD 16))) "
+                        + "(NP (PRP he))"
+                        + "(VP (VBD was) (VP (VBN born) "
+                        + "(PP in (NP Germany)))))",
+                        "On June 16 he was born Germany ")
+    ) ;
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/chunking/ParserTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/chunking/ParserTest.java
index e82410e9..37ae93e0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/chunking/ParserTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/chunking/ParserTest.java
@@ -17,52 +17,40 @@
 
 package opennlp.tools.parser.chunking;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
 
+import opennlp.tools.parser.AbstractParserModelTest;
 import opennlp.tools.parser.HeadRules;
 import opennlp.tools.parser.Parse;
-import opennlp.tools.parser.ParserFactory;
 import opennlp.tools.parser.ParserModel;
 import opennlp.tools.parser.ParserTestUtil;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 
 /**
- * Tests for the {@link Parser} class.
+ * Tests for the {@link opennlp.tools.parser.chunking.Parser} class.
  */
-public class ParserTest {
+public class ParserTest extends AbstractParserModelTest {
 
-  /**
-   * Verify that training and tagging does not cause
-   * runtime problems.
-   */
-  @Test
-  void testChunkingParserTraining() throws Exception {
+  /* Trained dynamically before test */
+  private static ParserModel model;
 
+  @Override
+  protected ParserModel getModel() {
+    return model;
+  }
+
+  @BeforeAll
+  public static void setupEnvironment() throws IOException {
     ObjectStream<Parse> parseSamples = ParserTestUtil.openTestTrainingData();
     HeadRules headRules = ParserTestUtil.createTestHeadRules();
-
-    ParserModel model = Parser.train("eng", parseSamples, headRules,
-        TrainingParameters.defaultParams());
-
-    opennlp.tools.parser.Parser parser = ParserFactory.create(model);
-
-    // TODO:
-    // Tests parsing to make sure the code does not has
-    // a bug which fails always with a runtime exception
-    // parser.parse(Parse.parseParse("She was just another freighter from the " +
-    // "States and she seemed as commonplace as her name ."));
-
-    // Test serializing and de-serializing model
-    ByteArrayOutputStream outArray = new ByteArrayOutputStream();
-    model.serialize(outArray);
-    outArray.close();
-
-    ParserModel outputModel = new ParserModel(new ByteArrayInputStream(outArray.toByteArray()));
-
-    // TODO: compare both models
+    // Training an English lang 'opennlp.tools.parser.chunking.Parse'
+    model = Parser.train("eng", parseSamples, headRules, TrainingParameters.defaultParams());
+    Assertions.assertNotNull(model);
+    Assertions.assertFalse(model.isLoadedFromSerialized());
   }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/treeinsert/ParserTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/treeinsert/ParserTest.java
index f851f620..ff5c16c4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/treeinsert/ParserTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/treeinsert/ParserTest.java
@@ -17,49 +17,40 @@
 
 package opennlp.tools.parser.treeinsert;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
 
+import opennlp.tools.parser.AbstractParserModelTest;
 import opennlp.tools.parser.HeadRules;
 import opennlp.tools.parser.Parse;
-import opennlp.tools.parser.ParserFactory;
 import opennlp.tools.parser.ParserModel;
 import opennlp.tools.parser.ParserTestUtil;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
 
 /**
- * Tests for the {@link Parser} class.
+ * Tests for the {@link opennlp.tools.parser.treeinsert.Parser} class.
  */
-public class ParserTest {
+public class ParserTest extends AbstractParserModelTest {
 
-  /**
-   * Verify that training and tagging does not cause
-   * runtime problems.
-   */
-  @Test
-  void testTreeInsertParserTraining() throws Exception {
+  /* Trained dynamically before test */
+  private static ParserModel model;
 
+  @Override
+  protected ParserModel getModel() {
+    return model;
+  }
+  
+  @BeforeAll
+  public static void setupEnvironment() throws IOException {
     ObjectStream<Parse> parseSamples = ParserTestUtil.openTestTrainingData();
     HeadRules headRules = ParserTestUtil.createTestHeadRules();
-
-    ParserModel model = Parser.train("eng", parseSamples, headRules, 100, 0);
-
-    opennlp.tools.parser.Parser parser = ParserFactory.create(model);
-
-    // Tests parsing to make sure the code does not has
-    // a bug which fails always with a runtime exception
-    parser.parse(Parse.parseParse("She was just another freighter from the " +
-        "States and she seemed as commonplace as her name ."));
-
-    // Test serializing and de-serializing model
-    ByteArrayOutputStream outArray = new ByteArrayOutputStream();
-    model.serialize(outArray);
-    outArray.close();
-
-    new ParserModel(new ByteArrayInputStream(outArray.toByteArray()));
-
-    // TODO: compare both models
+    // Training an English lang 'opennlp.tools.parser.treeinsert.Parser'
+    model = Parser.train("eng", parseSamples, headRules, TrainingParameters.defaultParams());
+    Assertions.assertNotNull(model);
+    Assertions.assertFalse(model.isLoadedFromSerialized());
   }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerModelTest.java
index a0a715eb..584657fa 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerModelTest.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 /**
@@ -29,24 +30,19 @@ import org.junit.jupiter.api.Test;
 public class TokenizerModelTest {
 
   @Test
-  void testSentenceModel() throws IOException {
+  void testTokenizerModelSerialization() throws IOException {
 
     TokenizerModel model = TokenizerTestUtil.createSimpleMaxentTokenModel();
+    Assertions.assertFalse(model.isLoadedFromSerialized());
 
     ByteArrayOutputStream arrayOut = new ByteArrayOutputStream();
     model.serialize(arrayOut);
     arrayOut.close();
 
-    model = new TokenizerModel(new ByteArrayInputStream(arrayOut.toByteArray()));
-    // TODO: check that both maxent models are equal
-
-    // Also test serialization after building model from an inputstream
-    arrayOut = new ByteArrayOutputStream();
-    model.serialize(arrayOut);
-    arrayOut.close();
-
-    new TokenizerModel(new ByteArrayInputStream(arrayOut.toByteArray()));
-
-    // TODO: check that both maxent models are equal
+    TokenizerModel modelRestored = new TokenizerModel(new ByteArrayInputStream(arrayOut.toByteArray()));
+    Assertions.assertNotNull(modelRestored);
+    Assertions.assertTrue(modelRestored.isLoadedFromSerialized());
+    Assertions.assertEquals(model, modelRestored);
+    
   }
 }
diff --git a/pom.xml b/pom.xml
index 5c60c87f..84fb8287 100644
--- a/pom.xml
+++ b/pom.xml
@@ -108,6 +108,13 @@
 				<scope>test</scope>
 			</dependency>
 
+			<dependency>
+				<groupId>org.junit.jupiter</groupId>
+				<artifactId>junit-jupiter-params</artifactId>
+				<version>${junit.version}</version>
+				<scope>test</scope>
+			</dependency>
+
 			<dependency>
 				<artifactId>opennlp-tools</artifactId>
 				<groupId>${project.groupId}</groupId>