You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/12/05 17:35:15 UTC
[opennlp] branch master updated: OPENNLP-1155: Remove deprecated
leipzig doccat format support
This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new a27bc32 OPENNLP-1155: Remove deprecated leipzig doccat format support
a27bc32 is described below
commit a27bc326901fb19558a2cc4cdd5fd64b000102c2
Author: thygesen <th...@apache.org>
AuthorDate: Tue Nov 21 12:38:55 2017 +0100
OPENNLP-1155: Remove deprecated leipzig doccat format support
---
.../tools/cmdline/StreamFactoryRegistry.java | 2 -
.../tools/formats/LeipzigDoccatSampleStream.java | 112 -----------------
.../LeipzigDocumentSampleStreamFactory.java | 86 -------------
.../opennlp/tools/eval/SourceForgeModelEval.java | 135 ++++++++++++++++-----
.../formats/LeipzigDoccatSampleStreamTest.java | 57 ---------
5 files changed, 108 insertions(+), 284 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index c078164..58bd87b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -30,7 +30,6 @@ import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
import opennlp.tools.formats.DocumentSampleStreamFactory;
import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
-import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
import opennlp.tools.formats.LemmatizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
import opennlp.tools.formats.ParseSampleStreamFactory;
@@ -105,7 +104,6 @@ public final class StreamFactoryRegistry {
ConllXPOSSampleStreamFactory.registerFactory();
ConllXSentenceSampleStreamFactory.registerFactory();
ConllXTokenSampleStreamFactory.registerFactory();
- LeipzigDocumentSampleStreamFactory.registerFactory();
ADChunkSampleStreamFactory.registerFactory();
ADNameSampleStreamFactory.registerFactory();
ADSentenceSampleStreamFactory.registerFactory();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
deleted file mode 100644
index 7059e21..0000000
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.formats;
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.nio.charset.StandardCharsets;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.tokenize.SimpleTokenizer;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.util.FilterObjectStream;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.PlainTextByLineStream;
-
-/**
- * Stream filter to produce document samples out of a Leipzig sentences.txt file.
- * In the Leipzig corpus the encoding of the various sentences.txt file is defined by
- * the language. The language must be specified to produce the category tags and is used
- * to determine the correct input encoding.
- * <p>
- * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
- * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.
- *
- * @deprecated will be removed, use the language detector instead
- */
-@Deprecated
-public class LeipzigDoccatSampleStream extends
- FilterObjectStream<String, DocumentSample> {
-
- private final Tokenizer tokenizer;
-
- private final String language;
- private final int sentencesPerDocument;
-
- /**
- * Creates a new LeipzigDoccatSampleStream with the specified parameters.
- *
- * @param language the Leipzig input sentences.txt file
- * @param sentencesPerDocument the number of sentences which
- * should be grouped into once {@link DocumentSample}
- * @param in the InputStream pointing to the contents of the sentences.txt input file
- * @throws IOException IOException
- */
- public LeipzigDoccatSampleStream(String language, int sentencesPerDocument, Tokenizer tokenizer,
- InputStreamFactory in) throws IOException {
- super(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
- System.setOut(new PrintStream(System.out, true, "UTF-8"));
- this.language = language;
- this.sentencesPerDocument = sentencesPerDocument;
- this.tokenizer = tokenizer;
- }
-
- /**
- * Creates a new LeipzigDoccatSampleStream with the specified parameters.
- *
- * @param language the Leipzig input sentences.txt file
- * @param sentencesPerDocument the number of sentences which should be
- * grouped into once {@link DocumentSample}
- * @param in the InputStream pointing to the contents of the sentences.txt input file
- * @throws IOException IOException
- */
- public LeipzigDoccatSampleStream(String language, int sentencesPerDocument,
- InputStreamFactory in) throws IOException {
- this(language, sentencesPerDocument, SimpleTokenizer.INSTANCE, in);
- }
-
- public DocumentSample read() throws IOException {
- int count = 0;
- List<String> tokensList = new ArrayList<>();
-
- String line;
- while (count < sentencesPerDocument && (line = samples.read()) != null) {
-
- String[] tokens = tokenizer.tokenize(line);
-
- if (tokens.length == 0) {
- throw new IOException("Empty lines are not allowed!");
- }
-
- // Always skip first token, that is the sentence number!
- tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length));
-
- count++;
- }
-
- if (tokensList.size() > 0) {
- return new DocumentSample(language, tokensList.toArray(new String[tokensList.size()]));
- }
-
- return null;
- }
-}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
deleted file mode 100644
index 133b7ea..0000000
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.formats;
-
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-
-import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.cmdline.StreamFactoryRegistry;
-import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.cmdline.params.EncodingParameter;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.ObjectStreamUtils;
-
-/**
- * <b>Note:</b> Do not use this class, internal use only!
- *
- * @deprecated will be removed, use the language detector instead
- */
-@Deprecated
-public class LeipzigDocumentSampleStreamFactory
- extends AbstractSampleStreamFactory<DocumentSample> {
-
- protected <P> LeipzigDocumentSampleStreamFactory(Class<P> params) {
- super(params);
- }
-
- public static void registerFactory() {
- StreamFactoryRegistry.registerFactory(DocumentSample.class,
- "leipzig", new LeipzigDocumentSampleStreamFactory(Parameters.class));
- }
-
- public ObjectStream<DocumentSample> create(String[] args) {
-
- Parameters params = ArgumentParser.parse(args, Parameters.class);
- File sentencesFileDir = params.getSentencesDir();
-
- File[] sentencesFiles = sentencesFileDir.listFiles(new FilenameFilter() {
- @Override
- public boolean accept(File dir, String name) {
- return name.contains("sentences") && name.endsWith(".txt");
- }
- });
-
- @SuppressWarnings("unchecked")
- ObjectStream<DocumentSample>[] sampleStreams =
- new ObjectStream[sentencesFiles.length];
-
- for (int i = 0; i < sentencesFiles.length; i++) {
- try {
- sampleStreams[i] = new LeipzigDoccatSampleStream(
- sentencesFiles[i].getName().substring(0, 3), 20,
- CmdLineUtil.createInputStreamFactory(sentencesFiles[i]));
- } catch (IOException e) {
- throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e);
- }
- }
-
- return ObjectStreamUtils.concatenateObjectStream(sampleStreams);
- }
-
- interface Parameters extends EncodingParameter {
- @ParameterDescription(valueName = "sentencesDir",
- description = "dir with Leipig sentences to be used")
- File getSentencesDir();
- }
-}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index d47c14a..cc86b5e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -22,6 +22,11 @@ import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
import org.junit.Assert;
import org.junit.BeforeClass;
@@ -31,8 +36,6 @@ import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.formats.LeipzigDoccatSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
@@ -47,12 +50,16 @@ import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
@@ -80,11 +87,83 @@ import opennlp.tools.util.Span;
*/
public class SourceForgeModelEval extends AbstractEvalTest {
+ private static class LeipzigTestSample {
+ private final List<String> text;
+
+ private LeipzigTestSample(String[] text) {
+ Objects.requireNonNull(text, "text must not be null");
+ this.text = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(text)));
+ }
+
+ public String[] getText() {
+ return text.toArray(new String[text.size()]);
+ }
+
+ @Override
+ public String toString() {
+
+ StringBuilder sampleString = new StringBuilder("eng");
+
+ sampleString.append('\t');
+
+ for (String s : text) {
+ sampleString.append(s).append(' ');
+ }
+
+ if (sampleString.length() > 0) {
+ // remove last space
+ sampleString.setLength(sampleString.length() - 1);
+ }
+
+ return sampleString.toString();
+ }
+ }
+
+ private static class LeipzigTestSampleStream extends FilterObjectStream<String, LeipzigTestSample> {
+
+ private final int sentencePerDocument;
+ private final Tokenizer tokenizer;
+
+ private LeipzigTestSampleStream(int sentencePerDocument, Tokenizer tokenizer, InputStreamFactory in)
+ throws IOException {
+ super(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+ this.sentencePerDocument = sentencePerDocument;
+ this.tokenizer = tokenizer;
+ }
+
+ @Override
+ public LeipzigTestSample read() throws IOException {
+ int count = 0;
+ List<String> tokensList = new ArrayList<>();
+
+ String line;
+ while (count < sentencePerDocument && (line = samples.read()) != null) {
+
+ String[] tokens = tokenizer.tokenize(line);
+
+ if (tokens.length == 0) {
+ throw new IOException("Empty lines are not allowed!");
+ }
+
+ // Always skip first token, that is the sentence number!
+ tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length));
+
+ count++;
+ }
+
+ if (tokensList.size() > 0) {
+ return new LeipzigTestSample(tokensList.toArray(new String[tokensList.size()]));
+ }
+
+ return null;
+ }
+ }
+
@BeforeClass
public static void verifyTrainingData() throws Exception {
- verifyTrainingData(new LeipzigDoccatSampleStream("eng", 25,
+ verifyTrainingData(new LeipzigTestSampleStream(25, SimpleTokenizer.INSTANCE,
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
- "leipzig/eng_news_2010_300K-sentences.txt"))),
+ "leipzig/eng_news_2010_300K-sentences.txt"))),
new BigInteger("172812413483919324675263268750583851712"));
}
@@ -92,7 +171,7 @@ public class SourceForgeModelEval extends AbstractEvalTest {
public void evalSentenceModel() throws Exception {
SentenceModel model = new SentenceModel(
- new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
@@ -100,11 +179,12 @@ public class SourceForgeModelEval extends AbstractEvalTest {
StringBuilder text = new StringBuilder();
- try (ObjectStream<DocumentSample> lineBatches = new LeipzigDoccatSampleStream("eng", 25,
- new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
- "leipzig/eng_news_2010_300K-sentences.txt")))) {
+ try (ObjectStream<LeipzigTestSample> lineBatches = new LeipzigTestSampleStream(25,
+ SimpleTokenizer.INSTANCE,
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt")))) {
- DocumentSample lineBatch;
+ LeipzigTestSample lineBatch;
while ((lineBatch = lineBatches.read()) != null) {
text.append(String.join(" ", lineBatch.getText())).append(" ");
}
@@ -117,7 +197,7 @@ public class SourceForgeModelEval extends AbstractEvalTest {
}
Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
- new BigInteger(1, digest.digest()));
+ new BigInteger(1, digest.digest()));
}
@Test
@@ -128,18 +208,18 @@ public class SourceForgeModelEval extends AbstractEvalTest {
// and then tokenize it here
TokenizerModel model = new TokenizerModel(
- new File(getOpennlpDataDir(), "models-sf/en-token.bin"));
+ new File(getOpennlpDataDir(), "models-sf/en-token.bin"));
MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
Tokenizer tokenizer = new TokenizerME(model);
- try (ObjectStream<DocumentSample> lines = new LeipzigDoccatSampleStream("eng", 1,
- WhitespaceTokenizer.INSTANCE,
- new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
- "leipzig/eng_news_2010_300K-sentences.txt")))) {
+ try (ObjectStream<LeipzigTestSample> lines = new LeipzigTestSampleStream(1,
+ WhitespaceTokenizer.INSTANCE,
+ new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
+ "leipzig/eng_news_2010_300K-sentences.txt")))) {
- DocumentSample line;
+ LeipzigTestSample line;
while ((line = lines.read()) != null) {
String[] tokens = tokenizer.tokenize(String.join(" ", line.getText()));
for (String token : tokens) {
@@ -149,11 +229,12 @@ public class SourceForgeModelEval extends AbstractEvalTest {
}
Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
- new BigInteger(1, digest.digest()));
+ new BigInteger(1, digest.digest()));
}
- private ObjectStream<DocumentSample> createLineWiseStream() throws IOException {
- return new LeipzigDoccatSampleStream("eng", 1,
+ private ObjectStream<LeipzigTestSample> createLineWiseStream() throws IOException {
+ return new LeipzigTestSampleStream(1,
+ SimpleTokenizer.INSTANCE,
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
"leipzig/eng_news_2010_300K-sentences.txt")));
}
@@ -166,9 +247,9 @@ public class SourceForgeModelEval extends AbstractEvalTest {
TokenNameFinder nameFinder = new NameFinderME(model);
- try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
+ try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
- DocumentSample line;
+ LeipzigTestSample line;
while ((line = lines.read()) != null) {
Span[] names = nameFinder.find(line.getText());
for (Span name : names) {
@@ -248,9 +329,9 @@ public class SourceForgeModelEval extends AbstractEvalTest {
Chunker chunker = new ChunkerME(new ChunkerModel(
new File(getOpennlpDataDir(), "models-sf/en-chunker.bin")));
- try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
+ try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
- DocumentSample line;
+ LeipzigTestSample line;
while ((line = lines.read()) != null) {
POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));
@@ -274,9 +355,9 @@ public class SourceForgeModelEval extends AbstractEvalTest {
POSTagger tagger = new POSTaggerME(model);
- try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
+ try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
- DocumentSample line;
+ LeipzigTestSample line;
while ((line = lines.read()) != null) {
String[] tags = tagger.tag(line.getText());
for (String tag : tags) {
@@ -314,9 +395,9 @@ public class SourceForgeModelEval extends AbstractEvalTest {
Parser parser = ParserFactory.create(model);
- try (ObjectStream<DocumentSample> lines = createLineWiseStream()) {
+ try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
- DocumentSample line;
+ LeipzigTestSample line;
while ((line = lines.read()) != null) {
Parse[] parse = ParserTool.parseLine(String.join(" ", line.getText()), parser, 1);
if (parse.length > 0) {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java
deleted file mode 100644
index 974f0c8..0000000
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.formats;
-
-import java.io.IOException;
-
-import org.junit.Test;
-
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.util.InputStreamFactory;
-import opennlp.tools.util.ObjectStream;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-public class LeipzigDoccatSampleStreamTest {
-
- @Test
- public void testParsingSample() throws IOException {
- InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
- "/opennlp/tools/formats/leipzig-en.sample");
-
- ObjectStream<DocumentSample> sampleStream =
- new LeipzigDoccatSampleStream("en", 2, in);
-
- DocumentSample doc1 = sampleStream.read();
- assertEquals("en", doc1.getCategory());
-
- DocumentSample doc2 = sampleStream.read();
- assertEquals("en", doc2.getCategory());
-
- DocumentSample doc3 = sampleStream.read();
- assertEquals("en", doc3.getCategory());
-
- DocumentSample doc4 = sampleStream.read();
- assertEquals("en", doc4.getCategory());
-
- assertNull(sampleStream.read());
-
- sampleStream.close();
- }
-}
--
To stop receiving notification emails like this one, please contact
"commits@opennlp.apache.org" <co...@opennlp.apache.org>.