You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/03/04 12:10:40 UTC

[opennlp] 01/01: OPENNLP-1333 Write unit test for parser top "k" parses

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch OPENNLP-1333_Write_unit_test_for_parser_top_k_parses
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit d2f58caed08793f2ea6aff4c7fc3a020a8f32898
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sat Mar 4 13:10:27 2023 +0100

    OPENNLP-1333 Write unit test for parser top "k" parses
    
    - provides new test cases for `k = 1, 2, 3` for both Parser implementations
    - uses test data from https://github.com/apache/opennlp/pull/392
    - adds `toStringPennTreebank` in `Parse` to obtain a uniform string representation for verification or comparison
---
 .../src/main/java/opennlp/tools/parser/Parse.java  |  9 ++++
 .../tools/parser/AbstractParserModelTest.java      | 58 ++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
index 2e5d873c..33880baf 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
@@ -449,6 +449,15 @@ public class Parse implements Cloneable, Comparable<Parse> {
     return text.substring(span.getStart(), span.getEnd());
   }
 
+  /**
+   * @return The Penn Treebank-style {@link String} form of this parse, as produced by {@code show}.
+   */
+  public String toStringPennTreebank() {
+    final StringBuffer sb = new StringBuffer();
+    show(sb);
+    return sb.toString();
+  }
+
   /**
    * Represents this {@link Parse} in a human-readable way.
    */
diff --git a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
index 9df819a9..346e4eb7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/parser/AbstractParserModelTest.java
@@ -20,6 +20,9 @@ package opennlp.tools.parser;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
 import java.util.stream.Stream;
 
 import org.junit.jupiter.api.Assertions;
@@ -27,7 +30,9 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
+import org.junit.jupiter.params.provider.ValueSource;
 
+import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.Span;
 
 /**
@@ -86,6 +91,59 @@ public abstract class AbstractParserModelTest {
     Assertions.assertNotNull(s);
   }
 
+  /*
+   * Verifies that the parser under test returns exactly 'k' parses in best-first order and
+   * that the top parse matches the expected Penn Treebank-style string (OPENNLP-1330).
+   * See: https://issues.apache.org/jira/projects/OPENNLP/issues/OPENNLP-1333
+   * Uses test data from PR 392 (https://github.com/apache/opennlp/pull/392).
+   */
+  @ParameterizedTest
+  @ValueSource(ints = {1, 2, 3})
+  void testParsingTopParses(int k) {
+    // fixtures: one reference string per supported parser type
+    final String sent = "Eric is testing.";
+    final String refParseTopChunking =
+            "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NP (DT testing.)))))";
+    final String refParseTopTreeInsert =
+            "(TOP (S (NP (NNP Eric)) (VP (VBZ is) (NN testing.))))";
+
+    // prepare: re-join the tokens with single spaces so that token offsets are predictable
+    List<String> tokens = Arrays.asList(WhitespaceTokenizer.INSTANCE.tokenize(sent));
+    String text = String.join(" ", tokens);
+
+    Parse sentP = new Parse(text, new Span(0, text.length()),
+            AbstractBottomUpParser.INC_NODE, 0, 0);
+    int start = 0;
+    int i = 0;
+    for (Iterator<String> ti = tokens.iterator(); ti.hasNext(); i++) {
+      String tok = ti.next();
+      sentP.insert(new Parse(text, new Span(start, start + tok.length()),
+              AbstractBottomUpParser.TOK_NODE, 0, i));
+      start += tok.length() + 1; // + 1 skips the single space separating two tokens
+    }
+
+    opennlp.tools.parser.Parser parser = ParserFactory.create(getModel());
+    Assertions.assertNotNull(parser);
+
+    // TEST: parsing must yield exactly 'k' parses, none scoring higher than its predecessor
+    Parse[] parses = parser.parse(sentP, k);
+    Assertions.assertNotNull(parses);
+    Assertions.assertEquals(k, parses.length);
+    double previousProb = 0; // initial ref value; scores are presumably log-probs (<= 0) - confirm
+    for (int j = 0; j < parses.length; j++) {
+      Assertions.assertTrue(parses[j].getProb() <= previousProb, "parses must be sorted best-first");
+      String asPennTreebankStyle = parses[j].toStringPennTreebank();
+      previousProb = parses[j].getProb(); // carry forward: the next parse is compared to this one
+      if (j == 0) {
+        if (ParserType.CHUNKING.equals(getModel().getParserType())) {
+          Assertions.assertEquals(refParseTopChunking, asPennTreebankStyle);
+        } else if (ParserType.TREEINSERT.equals(getModel().getParserType())) {
+          Assertions.assertEquals(refParseTopTreeInsert, asPennTreebankStyle);
+        }
+      }
+    }
+  }
+
   /*
    * Produces a stream of <parse|text> pairs for parameterized unit tests.
    */