You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/24 14:51:46 UTC
opennlp git commit: OPENNLP-1075 Add streams for sentence and token
samples for conllu
Repository: opennlp
Updated Branches:
refs/heads/master d378c0656 -> 5bf5366e2
OPENNLP-1075 Add streams for sentence and token samples for conllu
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5bf5366e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5bf5366e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5bf5366e
Branch: refs/heads/master
Commit: 5bf5366e2d5eca700d33d5882b65a5795cb3d656
Parents: d378c06
Author: Jörn Kottmann <jo...@apache.org>
Authored: Tue May 23 17:28:33 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:29:51 2017 +0200
----------------------------------------------------------------------
.../tools/cmdline/StreamFactoryRegistry.java | 4 ++
.../conllu/ConlluLemmaSampleStreamFactory.java | 5 +-
.../tools/formats/conllu/ConlluSentence.java | 15 +++-
.../conllu/ConlluSentenceSampleStream.java | 59 +++++++++++++++
.../ConlluSentenceSampleStreamFactory.java | 65 +++++++++++++++++
.../tools/formats/conllu/ConlluStream.java | 30 +++++++-
.../formats/conllu/ConlluTokenSampleStream.java | 75 ++++++++++++++++++++
.../conllu/ConlluTokenSampleStreamFactory.java | 61 ++++++++++++++++
.../conllu/ConlluSentenceSampleStreamTest.java | 69 ++++++++++++++++++
.../tools/formats/conllu/ConlluStreamTest.java | 56 +++++++++++++++
.../conllu/ConlluTokenSampleStreamTest.java | 53 ++++++++++++++
.../formats/conllu/ConlluWordLineTest.java | 4 +-
.../formats/conllu/de-ud-train-sample.conllu | 30 ++++++++
13 files changed, 517 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 9977519..2cff212 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -44,6 +44,8 @@ import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluSentenceSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluTokenSampleStreamFactory;
import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -113,6 +115,8 @@ public final class StreamFactoryRegistry {
LetsmtSentenceStreamFactory.registerFactory();
MosesSentenceSampleStreamFactory.registerFactory();
+ ConlluTokenSampleStreamFactory.registerFactory();
+ ConlluSentenceSampleStreamFactory.registerFactory();
ConlluPOSSampleStreamFactory.registerFactory();
ConlluLemmaSampleStreamFactory.registerFactory();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index 4806967..3204d7e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -34,8 +34,6 @@ import opennlp.tools.util.ObjectStream;
*/
public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> {
- public static final String CONLLU_FORMAT = "conllu";
-
interface Parameters extends BasicFormatParams {
@ArgumentParser.ParameterDescription(valueName = "tagset",
description = "u|x u for unified tags and x for language-specific part-of-speech tags")
@@ -45,7 +43,8 @@ public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<
public static void registerFactory() {
StreamFactoryRegistry.registerFactory(LemmaSample.class,
- CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+ ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+ new ConlluLemmaSampleStreamFactory(Parameters.class));
}
protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 5d92d89..bbd2b96 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -23,11 +23,24 @@ public class ConlluSentence {
private List<ConlluWordLine> wordLines;
- ConlluSentence(List<ConlluWordLine> wordLines) {
+ private String sentenceIdComment;
+ private String textComment;
+
+ ConlluSentence(List<ConlluWordLine> wordLines, String sentenceIdComment, String textComment) {
this.wordLines = wordLines;
+ this.sentenceIdComment = sentenceIdComment;
+ this.textComment = textComment;
}
public List<ConlluWordLine> getWordLines() {
return wordLines;
}
+
+ public String getSentenceIdComment() {
+ return sentenceIdComment;
+ }
+
+ public String getTextComment() {
+ return textComment;
+ }
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
new file mode 100644
index 0000000..f49e205
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Converts a stream of {@link ConlluSentence}s into {@link SentenceSample}s. The raw text
+ * comments of up to {@code sentencesPerSample} consecutive sentences are joined with a single
+ * space into one document.
+ */
+public class ConlluSentenceSampleStream extends FilterObjectStream<ConlluSentence, SentenceSample> {
+
+  private final int sentencesPerSample;
+
+  /**
+   * @param samples the sentences to read from
+   * @param sentencesPerSample the maximum number of sentences which are joined into one sample
+   */
+  public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int sentencesPerSample) {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+  }
+
+  /**
+   * Reads up to {@code sentencesPerSample} sentences and joins their text into one document.
+   *
+   * @return the next sample, or {@code null} if no text could be read
+   * @throws IOException if reading fails or if a sentence has no text comment
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder documentText = new StringBuilder();
+
+    List<Span> sentenceSpans = new ArrayList<>();
+
+    ConlluSentence sentence;
+    for (int i = 0; i < sentencesPerSample && (sentence = samples.read()) != null; i++) {
+
+      String sentenceText = sentence.getTextComment();
+
+      // StringBuilder.append(null) would silently add the four characters "null" to the
+      // document, therefore fail in the same way ConlluTokenSampleStream does
+      if (sentenceText == null) {
+        throw new IOException("Sentence is missing raw text sample!");
+      }
+
+      int startIndex = documentText.length();
+      documentText.append(sentenceText).append(' ');
+
+      // the span must not include the separator which was just appended
+      sentenceSpans.add(new Span(startIndex, documentText.length() - 1));
+    }
+
+    if (documentText.length() > 0) {
+      // remove the separator behind the last sentence
+      documentText.setLength(documentText.length() - 1);
+      return new SentenceSample(documentText, sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+    }
+
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..000af27
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link SentenceSample} streams from CoNLL-U data for the command line tools.
+ */
+public class ConlluSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  /** Command line arguments understood by this factory. */
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+  }
+
+  /**
+   * Registers an instance of this factory for {@link SentenceSample}s under the CoNLL-U
+   * format name.
+   */
+  public static void registerFactory() {
+    ConlluSentenceSampleStreamFactory factory =
+        new ConlluSentenceSampleStreamFactory(Parameters.class);
+
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT, factory);
+  }
+
+  protected <P> ConlluSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments and opens the sample stream on the referenced data.
+   *
+   * @param args the command line arguments described by {@link Parameters}
+   * @return the sample stream, or {@code null} if the error handler returns after a failure
+   */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters arguments = ArgumentParser.parse(args, Parameters.class);
+    InputStreamFactory dataFactory = CmdLineUtil.createInputStreamFactory(arguments.getData());
+
+    ObjectStream<SentenceSample> sampleStream = null;
+    try {
+      // the data is opened before the sentence count is parsed
+      ConlluStream sentences = new ConlluStream(dataFactory);
+      int sentencesPerSample = Integer.parseInt(arguments.getSentencesPerSample());
+      sampleStream = new ConlluSentenceSampleStream(sentences, sentencesPerSample);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+
+    return sampleStream;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index 873a9ed..cbac450 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -49,15 +49,39 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
BufferedReader reader = new BufferedReader(new StringReader(sentence));
+ String sentenceId = null;
+ String text = null;
+
String line;
while ((line = reader.readLine()) != null) {
- // # indicates a comment line and should be skipped
- if (!line.trim().startsWith("#")) {
+ // # indicates a comment line and contains additional data
+ if (line.trim().startsWith("#")) {
+ String commentLine = line.trim().substring(1);
+
+ int separator = commentLine.indexOf('=');
+
+ if (separator != -1) {
+ String firstPart = commentLine.substring(0, separator).trim();
+ String secondPart = commentLine.substring(separator + 1, commentLine.length()).trim();
+
+ if (!secondPart.isEmpty()) {
+ switch (firstPart) {
+ case "sent_id":
+ sentenceId = secondPart;
+ break;
+ case "text":
+ text = secondPart;
+ break;
+ }
+ }
+ }
+ }
+ else {
wordLines.add(new ConlluWordLine(line));
}
}
- return new ConlluSentence(wordLines);
+ return new ConlluSentence(wordLines, sentenceId, text);
}
return null;
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
new file mode 100644
index 0000000..a9ad937
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Converts {@link ConlluSentence}s into {@link TokenSample}s by aligning the word forms of a
+ * sentence with its raw text comment.
+ */
+public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, TokenSample> {
+
+  public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads the next sentence and inserts {@link TokenSample#DEFAULT_SEPARATOR_CHARS} behind every
+   * token which is not followed by whitespace or by the end of the text.
+   *
+   * @return the next sample, or {@code null} if the underlying stream returned {@code null}
+   * @throws IOException if the sentence has no text comment or if a word form cannot be found
+   *     in the remaining text
+   */
+  @Override
+  public TokenSample read() throws IOException {
+    ConlluSentence sentence = samples.read();
+    if (sentence == null) {
+      return null;
+    }
+
+    if (sentence.getTextComment() == null) {
+      throw new IOException("Sentence is missing raw text sample!");
+    }
+
+    StringBuilder text = new StringBuilder(sentence.getTextComment());
+
+    // index of the first character behind the previously matched token and its separator
+    int searchIndex = 0;
+
+    // NOTE(review): only ids containing '.' are skipped; ids of multiword token ranges
+    // (e.g. 1-2) get no special treatment here - confirm whether the input can contain them
+    for (ConlluWordLine wordLine : sentence.getWordLines()) {
+
+      // skip over inserted words which are not in the source text
+      if (wordLine.getId().contains(".")) {
+        continue;
+      }
+
+      String token = wordLine.getForm();
+      int tokenIndex = text.indexOf(token, searchIndex);
+
+      if (tokenIndex == -1) {
+        throw new IOException(String.format("Failed to match token [%s] in sentence [%s] with text [%s]",
+            token, sentence.getSentenceIdComment(), text));
+      }
+
+      int charAfterTokenIndex = tokenIndex + token.length();
+
+      // Continue directly behind the match. Adding up the token lengths only would ignore the
+      // whitespace between tokens, the search would then start too early and a later token
+      // could be matched inside of text which was already consumed.
+      searchIndex = charAfterTokenIndex;
+
+      if (charAfterTokenIndex < text.length()
+          && !StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
+        text.insert(charAfterTokenIndex, TokenSample.DEFAULT_SEPARATOR_CHARS);
+        searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
+      }
+    }
+
+    return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
new file mode 100644
index 0000000..5db0407
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link TokenSample} streams from CoNLL-U data for the command line tools.
+ */
+public class ConlluTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> {
+
+  /** Command line arguments understood by this factory, nothing beyond the basic ones. */
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /**
+   * Registers an instance of this factory for {@link TokenSample}s under the CoNLL-U
+   * format name.
+   */
+  public static void registerFactory() {
+    ConlluTokenSampleStreamFactory factory = new ConlluTokenSampleStreamFactory(Parameters.class);
+
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT, factory);
+  }
+
+  protected <P> ConlluTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments and opens the sample stream on the referenced data.
+   *
+   * @param args the command line arguments described by {@link Parameters}
+   * @return the sample stream, or {@code null} if the error handler returns after a failure
+   */
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters arguments = ArgumentParser.parse(args, Parameters.class);
+    InputStreamFactory dataFactory = CmdLineUtil.createInputStreamFactory(arguments.getData());
+
+    ObjectStream<TokenSample> sampleStream = null;
+    try {
+      ConlluStream sentences = new ConlluStream(dataFactory);
+      sampleStream = new ConlluTokenSampleStream(sentences);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+
+    return sampleStream;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
new file mode 100644
index 0000000..d45d38f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStreamTest {
+
+  /**
+   * Reads the sample file once with one sentence per sample and once with up to three
+   * sentences per sample.
+   */
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory sampleData =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    // one sentence per sample, every sentence ends up in a document of its own
+    try (ObjectStream<SentenceSample> singleSentenceSamples =
+        new ConlluSentenceSampleStream(new ConlluStream(sampleData), 1)) {
+
+      SentenceSample first = singleSentenceSamples.read();
+      String firstDocument = first.getDocument();
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          firstDocument);
+
+      Span firstSpan = first.getSentences()[0];
+      Assert.assertEquals(new Span(0, 65), firstSpan);
+
+      SentenceSample second = singleSentenceSamples.read();
+      String secondDocument = second.getDocument();
+      Assert.assertEquals("Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch " +
+          "endlich keine Rückenschmerzen mehr.", secondDocument);
+
+      Span secondSpan = second.getSentences()[0];
+      Assert.assertEquals(new Span(0, 95), secondSpan);
+
+      SentenceSample afterLast = singleSentenceSamples.read();
+      Assert.assertNull("Stream must be exhausted", afterLast);
+    }
+
+    // up to three sentences per sample, both sentences are joined into one document
+    try (ObjectStream<SentenceSample> joinedSamples =
+        new ConlluSentenceSampleStream(new ConlluStream(sampleData), 3)) {
+
+      SentenceSample joined = joinedSamples.read();
+      String joinedDocument = joined.getDocument();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team."
+          + " Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine "
+          + "Rückenschmerzen mehr.",
+          joinedDocument);
+
+      SentenceSample afterLast = joinedSamples.read();
+      Assert.assertNull("Stream must be exhausted", afterLast);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
new file mode 100644
index 0000000..63968a1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluStreamTest {
+
+  /**
+   * Checks the sentence id comment, the text comment and the number of word lines of both
+   * sentences in the sample file.
+   */
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory sampleData =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<ConlluSentence> sentences = new ConlluStream(sampleData)) {
+
+      ConlluSentence first = sentences.read();
+      String firstId = first.getSentenceIdComment();
+      Assert.assertEquals("train-s21", firstId);
+
+      String firstText = first.getTextComment();
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          firstText);
+
+      int firstWordCount = first.getWordLines().size();
+      Assert.assertEquals(11, firstWordCount);
+
+      ConlluSentence second = sentences.read();
+      String secondId = second.getSentenceIdComment();
+      Assert.assertEquals("train-s22", secondId);
+
+      String secondText = second.getTextComment();
+      Assert.assertEquals(
+          "Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.",
+          secondText);
+
+      int secondWordCount = second.getWordLines().size();
+      Assert.assertEquals(14, secondWordCount);
+
+      ConlluSentence afterLast = sentences.read();
+      Assert.assertNull("Stream must be exhausted", afterLast);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
new file mode 100644
index 0000000..62cb9a6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamTest {
+
+  /**
+   * Checks that a separator is inserted in front of the punctuation of both sentences in the
+   * sample file, where no whitespace indicates the token boundary.
+   */
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory sampleData =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    String separator = TokenSample.DEFAULT_SEPARATOR_CHARS;
+
+    try (ObjectStream<TokenSample> tokenSamples =
+        new ConlluTokenSampleStream(new ConlluStream(sampleData))) {
+
+      String firstAnnotated = "Fachlich kompetent" + separator
+          + ", sehr gute Beratung und ein freundliches Team" + separator
+          + ".";
+      TokenSample firstExpected = TokenSample.parse(firstAnnotated, separator);
+      TokenSample firstActual = tokenSamples.read();
+      Assert.assertEquals(firstExpected, firstActual);
+
+      String secondAnnotated = "Beiden Zahnärzten verdanke ich einen " +
+          "neuen Biss und dadurch endlich keine Rückenschmerzen mehr"
+          + separator + ".";
+      TokenSample secondExpected = TokenSample.parse(secondAnnotated, separator);
+      TokenSample secondActual = tokenSamples.read();
+      Assert.assertEquals(secondExpected, secondActual);
+
+      TokenSample afterLast = tokenSamples.read();
+      Assert.assertNull("Stream must be exhausted", afterLast);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
index 4676f6f..005ec55 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -27,10 +27,10 @@ public class ConlluWordLineTest {
@Test
public void testParseLine() throws InvalidFormatException {
ConlluWordLine line = new ConlluWordLine(
- "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+ "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
Assert.assertEquals("12", line.getId());
- Assert.assertEquals("Händen", line.getForm());
+ Assert.assertEquals("Händen", line.getForm());
Assert.assertEquals("Hand", line.getLemma());
Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));
http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
new file mode 100644
index 0000000..13c19da
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
@@ -0,0 +1,30 @@
+# sent_id = train-s21
+# text = Fachlich kompetent, sehr gute Beratung und ein freundliches Team.
+1 Fachlich fachlich ADV ADJD _ 2 advmod _ _
+2 kompetent kompetent ADJ ADJD Degree=Pos 0 root _ SpaceAfter=No
+3 , , PUNCT $, _ 2 punct _ _
+4 sehr sehr ADV ADV _ 5 advmod _ _
+5 gute gut ADJ ADJA Degree=Pos 6 amod _ _
+6 Beratung Beratung NOUN NN _ 2 parataxis _ _
+7 und und CCONJ KON _ 10 cc _ _
+8 ein ein DET ART Definite=Ind|PronType=Art 10 det _ _
+9 freundliches freundlich ADJ ADJA Degree=Pos 10 amod _ _
+10 Team Team NOUN NN _ 6 conj _ SpaceAfter=No
+11 . . PUNCT $. _ 2 punct _ _
+
+# sent_id = train-s22
+# text = Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.
+1 Beiden beide PRON PIAT Case=Dat|Number=Plur|NumType=Card|PronType=Tot 2 det _ _
+2 Zahnärzten Zahnarzt NOUN NN Case=Dat|Number=Plur 3 iobj _ _
+3 verdanke verdanken VERB VVFIN Number=Sing|Person=1|VerbForm=Fin 0 root _ _
+4 ich ich PRON PPER Case=Nom|Number=Sing|Person=1|PronType=Prs 3 nsubj _ _
+5 einen ein DET ART Case=Acc|Definite=Ind|Number=Plur|PronType=Art 7 det _ _
+6 neuen neu ADJ ADJA Case=Acc|Degree=Pos|Number=Plur 7 amod _ _
+7 Biss Biß NOUN NN Case=Acc|Number=Plur 3 obj _ _
+8 und und CCONJ KON _ 12 cc _ _
+9 dadurch dadurch ADV PAV _ 7 advmod _ _
+10 endlich endlich ADV ADV _ 12 advmod _ _
+11 keine kein PRON PIAT PronType=Neg 12 advmod _ _
+12 Rückenschmerzen Rückenschmerz NOUN NN _ 7 conj _ _
+13 mehr mehr ADV ADV _ 12 advmod _ SpaceAfter=No
+14 . . PUNCT $. _ 3 punct _ _
\ No newline at end of file