Posted to commits@opennlp.apache.org by jo...@apache.org on 2016/12/26 13:34:03 UTC
[09/50] [abbrv] opennlp git commit: Replace private text with Leipzig English news
Replace private text with Leipzig English news
See issue OPENNLP-877
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a7826d2b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a7826d2b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a7826d2b
Branch: refs/heads/889
Commit: a7826d2b86810ea0e094281c575b99d377021295
Parents: bbc5a34
Author: Joern Kottmann <ko...@gmail.com>
Authored: Thu Dec 15 21:03:28 2016 +0100
Committer: Joern Kottmann <ko...@gmail.com>
Committed: Mon Dec 19 10:36:36 2016 +0100
----------------------------------------------------------------------
.../formats/LeipzigDoccatSampleStream.java | 23 ++-
.../tools/eval/SourceForgeModelEval.java | 169 +++++++++++++------
2 files changed, 136 insertions(+), 56 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a7826d2b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 0af66ae..0ac318a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.PlainTextByLineStream;
@@ -40,6 +41,8 @@ import opennlp.tools.util.PlainTextByLineStream;
public class LeipzigDoccatSampleStream extends
FilterObjectStream<String, DocumentSample> {
+ private final Tokenizer tokenizer;
+
private final String language;
private final int sentencesPerDocument;
@@ -51,12 +54,26 @@ public class LeipzigDoccatSampleStream extends
* @param in the InputStream pointing to the contents of the sentences.txt input file
* @throws IOException IOException
*/
- LeipzigDoccatSampleStream(String language, int sentencesPerDocument,
- InputStreamFactory in) throws IOException {
+ public LeipzigDoccatSampleStream(String language, int sentencesPerDocument, Tokenizer tokenizer,
+ InputStreamFactory in) throws IOException {
super(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
System.setOut(new PrintStream(System.out, true, "UTF-8"));
this.language = language;
this.sentencesPerDocument = sentencesPerDocument;
+ this.tokenizer = tokenizer;
+ }
+
+ /**
+ * Creates a new LeipzigDoccatSampleStream with the specified parameters.
+ *
+ * @param language the language of the input text
+ * @param sentencesPerDocument the number of sentences which should be grouped into one {@link DocumentSample}
+ * @param in the InputStream pointing to the contents of the sentences.txt input file
+ * @throws IOException if the input cannot be read
+ */
+ public LeipzigDoccatSampleStream(String language, int sentencesPerDocument,
+ InputStreamFactory in) throws IOException {
+ this(language, sentencesPerDocument, SimpleTokenizer.INSTANCE, in);
}
public DocumentSample read() throws IOException {
@@ -68,7 +85,7 @@ public class LeipzigDoccatSampleStream extends
String line;
while (count < sentencesPerDocument && (line = samples.read()) != null) {
- String tokens[] = SimpleTokenizer.INSTANCE.tokenize(line);
+ String tokens[] = tokenizer.tokenize(line);
if (tokens.length == 0) {
throw new IOException("Empty lines are not allowed!");
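----------------------------------------------------------------------
For reference, a minimal sketch of how the now-public, tokenizer-aware
constructor can be used after this change; the relative file path and
class name below are placeholders, not part of the commit:

import java.io.File;
import java.io.IOException;

import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.formats.LeipzigDoccatSampleStream;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;

public class LeipzigStreamSketch {

    public static void main(String[] args) throws IOException {
        // Group 25 sentences into one DocumentSample and split each line
        // with the whitespace tokenizer instead of the SimpleTokenizer
        // default used by the two-argument convenience constructor.
        try (ObjectStream<DocumentSample> samples = new LeipzigDoccatSampleStream("en", 25,
                WhitespaceTokenizer.INSTANCE,
                new MarkableFileInputStreamFactory(
                    new File("leipzig/eng_news_2010_300K-sentences.txt")))) {

            DocumentSample sample;
            while ((sample = samples.read()) != null) {
                System.out.println(sample.getCategory() + ": " + sample.getText().length + " tokens");
            }
        }
    }
}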
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a7826d2b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index f63fcb5..d996afa 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -22,6 +22,7 @@ import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
+import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.formats.LeipzigDoccatSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
@@ -55,12 +56,27 @@ import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
- * The tests only run if the input text files are available and those
- * are derived from the leipzig corpus.
+ * This test ensures that the existing SourceForge models perform
+ * as they are expected to.
*
- * Next step is to replace the input texts with ones that don't have license issues.
- * Wikinews is probably a very good source. In addition, models that
- * can be shared are required to give everyone the possibility to run this.
+ * To run this test, the external Leipzig sentences file is needed:
+ * leipzig/eng_news_2010_300K-sentences.txt. This file can be
+ * obtained from the Leipzig corpus project. <br>
+ *
+ * In addition, all of the SourceForge models are needed:<br>
+ * - models-sf/en-sent.bin<br>
+ * - models-sf/en-token.bin<br>
+ * - models-sf/en-ner-date.bin<br>
+ * - models-sf/en-ner-location.bin<br>
+ * - models-sf/en-ner-money.bin<br>
+ * - models-sf/en-ner-organization.bin<br>
+ * - models-sf/en-ner-percentage.bin<br>
+ * - models-sf/en-ner-person.bin<br>
+ * - models-sf/en-ner-time.bin<br>
+ * - models-sf/en-chunker.bin<br>
+ * - models-sf/en-pos-maxent.bin<br>
+ * - models-sf/en-pos-perceptron.bin<br>
+ * - models-sf/en-parser-chunking.bin<br>
*/
public class SourceForgeModelEval {
@@ -73,10 +89,28 @@ public class SourceForgeModelEval {
}
@Test
+ public void ensureTestDataIsCorrect() throws IOException {
+ MessageDigest digest = createDigest();
+
+ try (ObjectStream<String> lines = new PlainTextByLineStream(
+ new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt")), Charset.forName("UTF-8"))) {
+
+ String line;
+ while ((line = lines.read()) != null) {
+ digest.update(line.getBytes("UTF-8"));
+ }
+
+ Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"),
+ new BigInteger(1, digest.digest()));
+ }
+ }
+
+ @Test
public void evalSentenceModel() throws IOException {
SentenceModel model = new SentenceModel(
- new File("/home/burn/opennlp-data-dir", "models-sf/en-sent.bin"));
+ new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
MessageDigest digest = createDigest();
@@ -84,13 +118,16 @@ public class SourceForgeModelEval {
StringBuilder text = new StringBuilder();
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new File("/home/burn/opennlp-data-dir",
- "leipzig/sentences.txt")), Charset.forName("UTF-8"))) {
+ try (ObjectStream<DocumentSample> lineBatches = new LeipzigDoccatSampleStream("en", 25,
+ new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt")))) {
- String line;
- while ((line = lines.read()) != null) {
- text.append(line).append(" ");
+ DocumentSample lineBatch;
+ while ((lineBatch = lineBatches.read()) != null) {
+ // TODO: Replace with Java 8 join
+ for (String token : lineBatch.getText()) {
+ text.append(token).append(" ");
+ }
}
}
@@ -100,13 +137,17 @@ public class SourceForgeModelEval {
digest.update(sentence.getBytes("UTF-8"));
}
- Assert.assertEquals(new BigInteger("54058993675314170033586747935067060992"),
+ Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
new BigInteger(1, digest.digest()));
}
@Test
public void evalTokenModel() throws IOException {
+ // the input stream is currently tokenized; it should be detokenized again
+ // (or this stream could be extended to accept a tokenizer, so that a
+ // whitespace tokenizer can be passed in) and then tokenized here
+
TokenizerModel model = new TokenizerModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
@@ -114,23 +155,38 @@ public class SourceForgeModelEval {
Tokenizer tokenizer = new TokenizerME(model);
- try (ObjectStream<String> lines = new PlainTextByLineStream(
+ try (ObjectStream<DocumentSample> lines = new LeipzigDoccatSampleStream("en", 1,
+ WhitespaceTokenizer.INSTANCE,
new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
- "leipzig/sentences.txt")), Charset.forName("UTF-8"))) {
+ "leipzig/eng_news_2010_300K-sentences.txt")))) {
- String line;
+ DocumentSample line;
while ((line = lines.read()) != null) {
- String[] tokens = tokenizer.tokenize(line);
+
+ // TODO: Replace with Java 8 join
+ StringBuffer text = new StringBuffer();
+ for (String token : line.getText()) {
+ text.append(token).append(' ');
+ }
+
+ String[] tokens = tokenizer.tokenize(text.toString());
for (String token : tokens) {
digest.update(token.getBytes("UTF-8"));
}
}
}
- Assert.assertEquals(new BigInteger("309548448163611475251363008574168734058"),
+ Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
new BigInteger(1, digest.digest()));
}
+ private ObjectStream<DocumentSample> createLineWiseStream() throws IOException {
+ return new LeipzigDoccatSampleStream("en", 1,
+ new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt")));
+ }
+
+
private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
throws IOException {
@@ -138,13 +194,11 @@ public class SourceForgeModelEval {
TokenNameFinder nameFinder = new NameFinderME(model);
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTok.txt")),
- Charset.forName("UTF-8"))) {
+ try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
- String line;
+ DocumentSample line;
while ((line = lines.read()) != null) {
- Span[] names = nameFinder.find(WhitespaceTokenizer.INSTANCE.tokenize(line));
+ Span[] names = nameFinder.find(line.getText());
for (Span name : names) {
digest.update((name.getType() + name.getStart() + name.getEnd()).getBytes("UTF-8"));
}
@@ -159,7 +213,7 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin"));
- evalNameFinder(personModel, new BigInteger("13595680199220579055030594287753821185"));
+ evalNameFinder(personModel, new BigInteger("116570003910213570906062355532299200317"));
}
@Test
@@ -167,7 +221,7 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-location.bin"));
- evalNameFinder(personModel, new BigInteger("61423868331440897441202803979849564658"));
+ evalNameFinder(personModel, new BigInteger("44810593886021404716125849669208680993"));
}
@Test
@@ -175,7 +229,7 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin"));
- evalNameFinder(personModel, new BigInteger("31779803056581858429003932617173745364"));
+ evalNameFinder(personModel, new BigInteger("65248897509365807977219790824670047287"));
}
@Test
@@ -183,7 +237,7 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-organization.bin"));
- evalNameFinder(personModel, new BigInteger("268615755804346283904103340480818555730"));
+ evalNameFinder(personModel, new BigInteger("50454559690338630659278005157657197233"));
}
@Test
@@ -191,7 +245,7 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-percentage.bin"));
- evalNameFinder(personModel, new BigInteger("1793019183238911248412519564457497503"));
+ evalNameFinder(personModel, new BigInteger("320996882594215344113023719117249515343"));
}
@Test
@@ -207,26 +261,25 @@ public class SourceForgeModelEval {
TokenNameFinderModel personModel = new TokenNameFinderModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin"));
- evalNameFinder(personModel, new BigInteger("264798318876255738642952635833268231353"));
+ evalNameFinder(personModel, new BigInteger("282941772380683328816791801782579055940"));
}
@Test
public void evalChunkerModel() throws IOException {
- ChunkerModel model = new ChunkerModel(
- new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin"));
-
MessageDigest digest = createDigest();
- Chunker chunker = new ChunkerME(model);
+ POSTagger tagger = new POSTaggerME(new POSModel(
+ new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTokPos.txt")),
- Charset.forName("UTF-8"))) {
+ Chunker chunker = new ChunkerME(new ChunkerModel(
+ new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")));
- String line;
+ try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
+
+ DocumentSample line;
while ((line = lines.read()) != null) {
- POSSample sentence = POSSample.parse(line);
+ POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));
String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
for (String chunk : chunks) {
@@ -235,22 +288,24 @@ public class SourceForgeModelEval {
}
}
- Assert.assertEquals(new BigInteger("87766988424222321513554054789708059330"),
+ Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
new BigInteger(1, digest.digest()));
}
private void evalPosModel(POSModel model, BigInteger expectedHash) throws IOException {
+
+ // break the input stream into sentences
+ // The input stream is tokenized and can be processed here directly
+
MessageDigest digest = createDigest();
POSTagger tagger = new POSTaggerME(model);
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
- "leipzig/simpleTok.txt")), Charset.forName("UTF-8"))) {
+ try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
- String line;
+ DocumentSample line;
while ((line = lines.read()) != null) {
- String[] tags = tagger.tag(WhitespaceTokenizer.INSTANCE.tokenize(line));
+ String[] tags = tagger.tag(line.getText());
for (String tag : tags) {
digest.update(tag.getBytes("UTF-8"));
}
@@ -265,7 +320,7 @@ public class SourceForgeModelEval {
POSModel maxentModel = new POSModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin"));
- evalPosModel(maxentModel, new BigInteger("6912278014292642909634347798602234960"));
+ evalPosModel(maxentModel, new BigInteger("231995214522232523777090597594904492687"));
}
@Test
@@ -273,28 +328,36 @@ public class SourceForgeModelEval {
POSModel perceptronModel = new POSModel(
new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"));
- evalPosModel(perceptronModel, new BigInteger("333081688760132868394207450128996236484"));
+ evalPosModel(perceptronModel, new BigInteger("209440430718727101220960491543652921728"));
}
@Test
public void evalParserModel() throws IOException {
+ // break input stream into sentences
+ // input is tokenized already
+
ParserModel model = new ParserModel(
- new File("/home/burn/opennlp-data-dir", "models-sf/en-parser-chunking.bin"));
+ new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
MessageDigest digest = createDigest();
Parser parser = ParserFactory.create(model);
- try (ObjectStream<String> lines = new PlainTextByLineStream(
- new MarkableFileInputStreamFactory(new File("/home/burn/opennlp-data-dir",
- "leipzig/simpleTok.txt")), Charset.forName("UTF-8"))) {
+ try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
- String line;
+ DocumentSample line;
while ((line = lines.read()) != null) {
- Parse[] parse = ParserTool.parseLine(line, parser, 1);
+ StringBuilder textLine = new StringBuilder();
+
+ // TODO: Replace with Java 8 join
+ for (String token : line.getText()) {
+ textLine.append(token).append(' ');
+ }
+
+ Parse[] parse = ParserTool.parseLine(textLine.toString(), parser, 1);
if (parse.length > 0) {
digest.update(parse[0].toString().getBytes("UTF-8"));
}
@@ -304,7 +367,7 @@ public class SourceForgeModelEval {
}
}
- Assert.assertEquals(new BigInteger("95566096874728850374427554294889512256"),
+ Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
new BigInteger(1, digest.digest()));
}
}
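----------------------------------------------------------------------
The expected values in the assertions above are the message digest of
the processed output, interpreted as a positive BigInteger. A minimal
sketch of that pattern, assuming createDigest() returns an MD5
MessageDigest (its body is not part of this diff):

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class DigestSketch {

    public static void main(String[] args) throws NoSuchAlgorithmException {
        // Assumption: the algorithm behind createDigest() is MD5; only the
        // digest-then-BigInteger pattern is visible in the diff itself.
        MessageDigest digest = MessageDigest.getInstance("MD5");

        for (String token : new String[] {"An", "example", "token", "sequence"}) {
            digest.update(token.getBytes(StandardCharsets.UTF_8));
        }

        // The signum argument 1 forces a non-negative value, which is why
        // the expected constants above are plain decimal BigInteger literals.
        System.out.println(new BigInteger(1, digest.digest()));
    }
}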