You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2018/04/16 14:34:06 UTC

[opennlp] branch master updated: [OPENNLP-941] Added eval support to detokenizer

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new b50aefc  [OPENNLP-941] Added eval support to detokenizer
b50aefc is described below

commit b50aefc261156f289d0cda76c357150491adbd67
Author: Suraj Krishnan Rajan <su...@gmail.com>
AuthorDate: Wed Apr 11 16:23:37 2018 +0530

    [OPENNLP-941] Added eval support to detokenizer
---
 .../tokenizer/DetokenEvaluationErrorListener.java  | 53 ++++++++++++++
 .../tools/tokenize/DetokenizerEvaluator.java       | 79 +++++++++++++++++++++
 .../tools/tokenize/DetokenizerEvaluatorTest.java   | 81 ++++++++++++++++++++++
 .../opennlp/tools/tokenize/TokenSampleTest.java    | 29 ++++----
 4 files changed, 230 insertions(+), 12 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/DetokenEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/DetokenEvaluationErrorListener.java
new file mode 100644
index 0000000..4e31332
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/DetokenEvaluationErrorListener.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.tokenizer;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.tokenize.TokenizerEvaluationMonitor;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * A default implementation of {@link EvaluationMonitor} for detokenizer
+ * evaluation; it reports misclassified samples to an output stream.
+ */
+public class DetokenEvaluationErrorListener extends
+    EvaluationErrorPrinter<TokenSample> implements TokenizerEvaluationMonitor {
+
+  /**
+   * Creates a listener that prints to {@code System.err}.
+   */
+  public DetokenEvaluationErrorListener() {
+    super(System.err);
+  }
+
+  /**
+   * Creates a listener that prints to the given {@link OutputStream}.
+   */
+  public DetokenEvaluationErrorListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  @Override
+  public void missclassified(TokenSample reference, TokenSample prediction) {
+    printError(reference, prediction);
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
new file mode 100644
index 0000000..7d9df4f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/DetokenizerEvaluator.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize;
+
+
+import java.util.ArrayList;
+
+import opennlp.tools.cmdline.tokenizer.DetokenEvaluationErrorListener;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.FMeasure;
+
+/**
+ * The {@link DetokenizerEvaluator} measures the performance of
+ * the given {@link Detokenizer} with the provided reference
+ * {@link TokenSample}s.
+ *
+ * @see FMeasure
+ * @see Detokenizer
+ * @see TokenSample
+ */
+
+public class DetokenizerEvaluator extends Evaluator<TokenSample> {
+  private FMeasure fmeasure = new FMeasure();
+
+  /**
+   * The {@link Detokenizer} used to create the
+   * predicted text.
+   */
+  private Detokenizer detokenizer;
+
+  /**
+   * Initializes the current instance with the
+   * given {@link Detokenizer}.
+   *
+   * @param detokenizer the {@link Detokenizer} to evaluate.
+   * @param listeners   evaluation sample listeners
+   */
+  public DetokenizerEvaluator(Detokenizer detokenizer, DetokenEvaluationErrorListener... listeners) {
+    super(listeners);
+    this.detokenizer = detokenizer;
+  }
+
+  @Override
+  protected TokenSample processSample(TokenSample reference) {
+    String[] tokens = Span.spansToStrings(reference.getTokenSpans(), reference.getText());
+    String tokensstring = detokenizer.detokenize(tokens, null);
+
+    ArrayList<String> predictionsArray = new ArrayList<>();
+    ArrayList<String> referencesArray = new ArrayList<>();
+
+    predictionsArray.add(tokensstring);
+    referencesArray.add(reference.getText());
+    // One-element arrays: the full reference text vs. the full detokenized text.
+    Object[] references = referencesArray.toArray();
+    Object[] predictions = predictionsArray.toArray();
+    fmeasure.updateScores(references, predictions);
+    // The prediction pairs the detokenized text with the reference token spans.
+    return new TokenSample(tokensstring, reference.getTokenSpans());
+  }
+
+  public FMeasure getFMeasure() {
+    return fmeasure;
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/DetokenizerEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/DetokenizerEvaluatorTest.java
new file mode 100644
index 0000000..8388cb9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/DetokenizerEvaluatorTest.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStream;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.cmdline.tokenizer.DetokenEvaluationErrorListener;
+import opennlp.tools.util.InvalidFormatException;
+
+
+public class DetokenizerEvaluatorTest {
+  @Test
+  public void testPositive() throws InvalidFormatException {
+    OutputStream stream = new ByteArrayOutputStream();
+    DetokenEvaluationErrorListener listener = new DetokenEvaluationErrorListener(stream);
+
+    DetokenizerEvaluator eval = new DetokenizerEvaluator(new DummyDetokenizer(
+        TokenSampleTest.createGoldSample()), listener);
+
+    eval.evaluateSample(TokenSampleTest.createGoldSample());
+
+    Assert.assertEquals(1.0, eval.getFMeasure().getFMeasure(), 0.0);
+
+    Assert.assertEquals(0, stream.toString().length());
+  }
+
+  @Test
+  public void testNegative() throws InvalidFormatException {
+    OutputStream stream = new ByteArrayOutputStream();
+    DetokenEvaluationErrorListener listener = new DetokenEvaluationErrorListener(
+        stream);
+
+    DetokenizerEvaluator eval = new DetokenizerEvaluator(new DummyDetokenizer(
+        TokenSampleTest.createGoldSample()), listener);
+
+    eval.evaluateSample(TokenSampleTest.createPredSilverSample());
+    // FMeasure reports -1 when precision and recall are both zero (no match).
+    Assert.assertEquals(-1.0d, eval.getFMeasure().getFMeasure(), .1d);
+    // Compare by value; assertNotSame would only compare boxed Integer identity.
+    Assert.assertNotEquals(0, stream.toString().length());
+  }
+
+  /**
+   * A dummy detokenizer that always returns the text of the sample it was created with.
+   */
+  static class DummyDetokenizer implements Detokenizer {
+
+    private final TokenSample sample;
+
+    public DummyDetokenizer(TokenSample sample) {
+      this.sample = sample;
+    }
+
+    public DetokenizationOperation[] detokenize(String[] tokens) {
+      return null;
+    }
+
+    public String detokenize(String[] tokens, String splitMarker) {
+      return this.sample.getText();
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenSampleTest.java
index 9b53c8b..b1dbba2 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenSampleTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenSampleTest.java
@@ -32,12 +32,27 @@ import opennlp.tools.util.Span;
 
 public class TokenSampleTest {
 
+  public static TokenSample createGoldSample() {
+    return new TokenSample("A test.", new Span[] {new Span(0, 1),
+        new Span(2, 6)});
+  }
+
+  public static TokenSample createPredSample() {
+    return new TokenSample("A test.", new Span[] {new Span(0, 3),
+        new Span(2, 6)});
+  }
+
+  public static TokenSample createPredSilverSample() {
+    return new TokenSample("A t st.", new Span[] {new Span(0, 1),
+        new Span(2, 6)});
+  }
+
   @Test
   public void testRetrievingContent() {
 
     String sentence = "A test";
 
-    TokenSample sample = new TokenSample(sentence, new Span[]{new Span(0, 1),
+    TokenSample sample = new TokenSample(sentence, new Span[] {new Span(0, 1),
         new Span(2, 6)});
 
     Assert.assertEquals("A test", sample.getText());
@@ -75,7 +90,7 @@ public class TokenSampleTest {
 
     Detokenizer detokenizer = DictionaryDetokenizerTest.createLatinDetokenizer();
 
-    String[] tokens = new String[]{
+    String[] tokens = new String[] {
         "start",
         "(", // move right
         ")", // move left
@@ -116,14 +131,4 @@ public class TokenSampleTest {
     Assert.assertFalse(createPredSample().equals(createGoldSample()));
     Assert.assertFalse(createPredSample().equals(new Object()));
   }
-
-  public static TokenSample createGoldSample() {
-    return new TokenSample("A test.", new Span[] { new Span(0, 1),
-        new Span(2, 6) });
-  }
-
-  public static TokenSample createPredSample() {
-    return new TokenSample("A test.", new Span[] { new Span(0, 3),
-        new Span(2, 6) });
-  }
 }

-- 
To stop receiving notification emails like this one, please contact
joern@apache.org.