You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:43 UTC
[17/50] opennlp git commit: OPENNLP-1040: Add OntoNotes4 training data verification
OPENNLP-1040: Add OntoNotes4 training data verification
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/40602173
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/40602173
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/40602173
Branch: refs/heads/LangDetect
Commit: 406021733baf6cdd339d7b14a413b2ffeeaae42d
Parents: 32afb6a
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri Apr 21 12:57:19 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon Apr 24 12:49:20 2017 +0200
----------------------------------------------------------------------
.../tools/eval/OntoNotes4NameFinderEval.java | 56 +++++++++++++++-----
.../tools/eval/OntoNotes4ParserEval.java | 45 ++++++++++++----
.../tools/eval/OntoNotes4PosTaggerEval.java | 45 ++++++++++++----
3 files changed, 116 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index e0e3912..ef018cd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.DirectorySampleStream;
@@ -37,9 +41,7 @@ import opennlp.tools.util.model.ModelUtil;
public class OntoNotes4NameFinderEval {
- private static void crossEval(TrainingParameters params, String type, double expectedScore)
- throws IOException {
-
+ private static ObjectStream<NameSample> createNameSampleStream() throws IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
@@ -50,19 +52,49 @@ public class OntoNotes4NameFinderEval {
return file.isDirectory();
}, true);
- ObjectStream<NameSample> samples = new OntoNotesNameSampleStream(new FileToStringSampleStream(
- documentStream, Charset.forName("UTF-8")));
+ return new OntoNotesNameSampleStream(new FileToStringSampleStream(
+ documentStream, StandardCharsets.UTF_8));
+ }
+
+ private static void crossEval(TrainingParameters params, String type, double expectedScore)
+ throws IOException {
+ try (ObjectStream<NameSample> samples = createNameSampleStream()) {
- TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null,
- params, new TokenNameFinderFactory());
+ TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null,
+ params, new TokenNameFinderFactory());
- if (type != null) {
- samples = new NameSampleTypeFilter(new String[] {type}, samples);
+ ObjectStream<NameSample> filteredSamples;
+ if (type != null) {
+ filteredSamples = new NameSampleTypeFilter(new String[] {type}, samples);
+ }
+ else {
+ filteredSamples = samples;
+ }
+
+ cv.evaluate(filteredSamples, 10);
+
+ Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+ }
+ }
+
+ @BeforeClass
+ public static void verifyTrainingData() throws IOException {
+ MessageDigest digest;
+ try {
+ digest = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalStateException(e);
}
- cv.evaluate(samples, 10);
+ try (ObjectStream<NameSample> samples = createNameSampleStream()) {
+ NameSample sample;
+ while ((sample = samples.read()) != null) {
+ digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+ }
- Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+ Assert.assertEquals(new BigInteger("168206908604555450993491898907821588182"),
+ new BigInteger(1, digest.digest()));
+ }
}
@Test
http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index 2182957..3a5b30d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -21,9 +21,13 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.DirectorySampleStream;
@@ -31,6 +35,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
import opennlp.tools.formats.ontonotes.DocumentToLineStream;
import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
import opennlp.tools.parser.HeadRules;
+import opennlp.tools.parser.Parse;
import opennlp.tools.parser.ParserCrossValidator;
import opennlp.tools.parser.ParserType;
import opennlp.tools.parser.lang.en.HeadRulesTest;
@@ -40,9 +45,7 @@ import opennlp.tools.util.model.ModelUtil;
public class OntoNotes4ParserEval {
- private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore)
- throws IOException {
-
+ private static ObjectStream<Parse> createParseSampleStream() throws IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
@@ -53,15 +56,39 @@ public class OntoNotes4ParserEval {
return file.isDirectory();
}, true);
- OntoNotesParseSampleStream samples = new OntoNotesParseSampleStream(
+ return new OntoNotesParseSampleStream(
new DocumentToLineStream(new FileToStringSampleStream(
- documentStream, Charset.forName("UTF-8"))));
+ documentStream, StandardCharsets.UTF_8)));
+ }
+
+ private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore)
+ throws IOException {
+ try (ObjectStream<Parse> samples = createParseSampleStream()) {
+ ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING);
+ cv.evaluate(samples, 10);
+
+ Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+ }
+ }
- ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING);
+ @BeforeClass
+ public static void verifyTrainingData() throws IOException {
+ MessageDigest digest;
+ try {
+ digest = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalStateException(e);
+ }
- cv.evaluate(samples, 10);
+ try (ObjectStream<Parse> samples = createParseSampleStream()) {
+ Parse sample;
+ while ((sample = samples.read()) != null) {
+ digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+ }
- Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+ Assert.assertEquals(new BigInteger("83833369887442127665956850482411800415"),
+ new BigInteger(1, digest.digest()));
+ }
}
@Test
http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index ab33568..b171978 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
import java.io.File;
import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import org.junit.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
import opennlp.tools.formats.DirectorySampleStream;
@@ -29,6 +33,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
import opennlp.tools.formats.convert.ParseToPOSSampleStream;
import opennlp.tools.formats.ontonotes.DocumentToLineStream;
import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
+import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerCrossValidator;
import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.util.ObjectStream;
@@ -37,9 +42,7 @@ import opennlp.tools.util.model.ModelUtil;
public class OntoNotes4PosTaggerEval {
- private static void crossEval(TrainingParameters params, double expectedScore)
- throws IOException {
-
+ private static ObjectStream<POSSample> createPOSSampleStream() throws IOException {
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
file -> {
@@ -50,16 +53,40 @@ public class OntoNotes4PosTaggerEval {
return file.isDirectory();
}, true);
- ParseToPOSSampleStream samples = new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
+ return new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
new DocumentToLineStream(
- new FileToStringSampleStream(documentStream, Charset.forName("UTF-8")))));
+ new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8))));
+ }
- POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory());
- cv.evaluate(samples, 10);
+ private static void crossEval(TrainingParameters params, double expectedScore)
+ throws IOException {
+ try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+ POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory());
+ cv.evaluate(samples, 10);
- Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+ Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+ }
}
+ @BeforeClass
+ public static void verifyTrainingData() throws IOException {
+ MessageDigest digest;
+ try {
+ digest = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalStateException(e);
+ }
+
+ try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+ POSSample sample;
+ while ((sample = samples.read()) != null) {
+ digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+ }
+
+ Assert.assertEquals(new BigInteger("300430765214895870888056958221353356972"),
+ new BigInteger(1, digest.digest()));
+ }
+ }
@Test
public void evalEnglishMaxentTagger() throws IOException {
crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9698145168879707d);