You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/24 14:51:46 UTC

opennlp git commit: OPENNLP-1075 Add streams for sentence and token samples for conllu

Repository: opennlp
Updated Branches:
  refs/heads/master d378c0656 -> 5bf5366e2


OPENNLP-1075 Add streams for sentence and token samples for conllu


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5bf5366e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5bf5366e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5bf5366e

Branch: refs/heads/master
Commit: 5bf5366e2d5eca700d33d5882b65a5795cb3d656
Parents: d378c06
Author: Jörn Kottmann <jo...@apache.org>
Authored: Tue May 23 17:28:33 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:29:51 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |  4 ++
 .../conllu/ConlluLemmaSampleStreamFactory.java  |  5 +-
 .../tools/formats/conllu/ConlluSentence.java    | 15 +++-
 .../conllu/ConlluSentenceSampleStream.java      | 59 +++++++++++++++
 .../ConlluSentenceSampleStreamFactory.java      | 65 +++++++++++++++++
 .../tools/formats/conllu/ConlluStream.java      | 30 +++++++-
 .../formats/conllu/ConlluTokenSampleStream.java | 75 ++++++++++++++++++++
 .../conllu/ConlluTokenSampleStreamFactory.java  | 61 ++++++++++++++++
 .../conllu/ConlluSentenceSampleStreamTest.java  | 69 ++++++++++++++++++
 .../tools/formats/conllu/ConlluStreamTest.java  | 56 +++++++++++++++
 .../conllu/ConlluTokenSampleStreamTest.java     | 53 ++++++++++++++
 .../formats/conllu/ConlluWordLineTest.java      |  4 +-
 .../formats/conllu/de-ud-train-sample.conllu    | 30 ++++++++
 13 files changed, 517 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 9977519..2cff212 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -44,6 +44,8 @@ import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
 import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluSentenceSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -113,6 +115,8 @@ public final class StreamFactoryRegistry {
     LetsmtSentenceStreamFactory.registerFactory();
     MosesSentenceSampleStreamFactory.registerFactory();
 
+    ConlluTokenSampleStreamFactory.registerFactory();
+    ConlluSentenceSampleStreamFactory.registerFactory();
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index 4806967..3204d7e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -34,8 +34,6 @@ import opennlp.tools.util.ObjectStream;
  */
 public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> {
 
-  public static final String CONLLU_FORMAT = "conllu";
-
   interface Parameters extends BasicFormatParams {
     @ArgumentParser.ParameterDescription(valueName = "tagset",
         description = "u|x u for unified tags and x for language-specific part-of-speech tags")
@@ -45,7 +43,8 @@ public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<
 
   public static void registerFactory() {
     StreamFactoryRegistry.registerFactory(LemmaSample.class,
-        CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluLemmaSampleStreamFactory(Parameters.class));
   }
 
   protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 5d92d89..bbd2b96 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -23,11 +23,24 @@ public class ConlluSentence {
 
   private List<ConlluWordLine> wordLines;
 
-  ConlluSentence(List<ConlluWordLine> wordLines) {
+  private String sentenceIdComment;
+  private String textComment;
+
+  ConlluSentence(List<ConlluWordLine> wordLines, String sentenceIdComment, String textComment) {
     this.wordLines = wordLines;
+    this.sentenceIdComment = sentenceIdComment;
+    this.textComment = textComment;
   }
 
   public List<ConlluWordLine> getWordLines() {
     return wordLines;
   }
+
+  public String getSentenceIdComment() {
+    return sentenceIdComment;
+  }
+
+  public String getTextComment() {
+    return textComment;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
new file mode 100644
index 0000000..f49e205
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Converts a stream of {@link ConlluSentence}s into {@link SentenceSample}s.
+ * <p>
+ * The raw text comments of up to {@code sentencesPerSample} consecutive sentences are
+ * joined with a single space into one document, and each sentence is marked with a
+ * {@link Span} over that document.
+ */
+public class ConlluSentenceSampleStream extends FilterObjectStream<ConlluSentence, SentenceSample> {
+
+  private final int sentencesPerSample;
+
+  /**
+   * @param samples the CoNLL-U sentences to convert
+   * @param sentencesPerSample the maximum number of sentences joined into one sample;
+   *     for values below one {@link #read()} consumes nothing and returns {@code null}
+   */
+  public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int sentencesPerSample) {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+  }
+
+  /**
+   * Reads up to {@code sentencesPerSample} sentences and joins them into one sample.
+   *
+   * @return the next sample, or {@code null} if no sentence could be read
+   * @throws IOException if reading fails or a sentence has no {@code text} comment
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder documentText = new StringBuilder();
+    List<Span> sentenceSpans = new ArrayList<>();
+
+    ConlluSentence sentence;
+    for (int i = 0; i < sentencesPerSample && (sentence = samples.read()) != null; i++) {
+      String sentenceText = sentence.getTextComment();
+
+      // Appending a null text would silently put the literal "null" into the document
+      // and produce a bogus sentence span, so fail loudly instead.
+      if (sentenceText == null) {
+        throw new IOException(String.format("Sentence [%s] is missing raw text sample!",
+            sentence.getSentenceIdComment()));
+      }
+
+      // sentences are separated by exactly one space, which belongs to no span
+      if (!sentenceSpans.isEmpty()) {
+        documentText.append(' ');
+      }
+
+      int startIndex = documentText.length();
+      documentText.append(sentenceText);
+      sentenceSpans.add(new Span(startIndex, documentText.length()));
+    }
+
+    if (sentenceSpans.isEmpty()) {
+      return null;
+    }
+
+    return new SentenceSample(documentText, sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..000af27
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluSentenceSampleStream(new ConlluStream(inFactory),
+          Integer.parseInt(params.getSentencesPerSample()));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index 873a9ed..cbac450 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -49,15 +49,39 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
 
       BufferedReader reader = new BufferedReader(new StringReader(sentence));
 
+      String sentenceId = null;
+      String text = null;
+
       String line;
       while ((line = reader.readLine())  != null) {
-        // # indicates a comment line and should be skipped
-        if (!line.trim().startsWith("#")) {
+        // # indicates a comment line and contains additional data
+        if (line.trim().startsWith("#")) {
+          String commentLine = line.trim().substring(1);
+
+          int separator = commentLine.indexOf('=');
+
+          if (separator != -1) {
+            String firstPart = commentLine.substring(0, separator).trim();
+            String secondPart = commentLine.substring(separator + 1, commentLine.length()).trim();
+
+            if (!secondPart.isEmpty()) {
+              switch (firstPart) {
+                case "sent_id":
+                  sentenceId = secondPart;
+                  break;
+                case "text":
+                  text = secondPart;
+                  break;
+              }
+            }
+          }
+        }
+        else {
           wordLines.add(new ConlluWordLine(line));
         }
       }
 
-      return new ConlluSentence(wordLines);
+      return new ConlluSentence(wordLines, sentenceId, text);
     }
 
     return null;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
new file mode 100644
index 0000000..a9ad937
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Converts a stream of {@link ConlluSentence}s into {@link TokenSample}s.
+ * <p>
+ * Each word form is located in the sentence's raw text comment, and a
+ * {@link TokenSample#DEFAULT_SEPARATOR_CHARS} marker is inserted after every token that is
+ * directly followed by a non-whitespace character.
+ */
+public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, TokenSample> {
+
+  public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads the next sentence and converts it into a token sample.
+   *
+   * @return the next sample, or {@code null} if the underlying stream is exhausted
+   * @throws IOException if reading fails, the sentence has no {@code text} comment, or a
+   *     word form cannot be found in the remaining raw text
+   */
+  @Override
+  public TokenSample read() throws IOException {
+    ConlluSentence sentence = samples.read();
+    if (sentence == null) {
+      return null;
+    }
+
+    if (sentence.getTextComment() == null) {
+      throw new IOException("Sentence is missing raw text sample!");
+    }
+
+    StringBuilder text = new StringBuilder(sentence.getTextComment());
+
+    // index in text right behind the previously matched token (and its separator, if any)
+    int searchIndex = 0;
+
+    for (ConlluWordLine wordLine : sentence.getWordLines()) {
+
+      // skip over inserted words which are not in the source text
+      if (wordLine.getId().contains(".")) {
+        continue;
+      }
+
+      String token = wordLine.getForm();
+      int tokenIndex = text.indexOf(token, searchIndex);
+
+      if (tokenIndex == -1) {
+        throw new IOException(String.format("Failed to match token [%s] in sentence [%s] with text [%s]",
+            token, sentence.getSentenceIdComment(), text));
+      }
+
+      // Continue behind the actual match position. Advancing by the token length alone
+      // ignores the whitespace between tokens, lets the search position fall behind and
+      // allows a repeated form (e.g. "b b.") to be matched at its earlier occurrence again.
+      int charAfterTokenIndex = tokenIndex + token.length();
+      searchIndex = charAfterTokenIndex;
+
+      if (charAfterTokenIndex < text.length()
+          && !StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
+        text.insert(charAfterTokenIndex, TokenSample.DEFAULT_SEPARATOR_CHARS);
+        searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
+      }
+    }
+
+    return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
new file mode 100644
index 0000000..5db0407
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluTokenSampleStream(new ConlluStream(inFactory));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
new file mode 100644
index 0000000..d45d38f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 1)) {
+
+      SentenceSample sample1 = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          sample1.getDocument());
+
+      Assert.assertEquals(new Span(0, 65), sample1.getSentences()[0]);
+
+      SentenceSample sample2 = stream.read();
+
+      Assert.assertEquals("Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch " +
+          "endlich keine Rückenschmerzen mehr.", sample2.getDocument());
+      Assert.assertEquals(new Span(0, 95), sample2.getSentences()[0]);
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 3)) {
+      SentenceSample sample = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team."
+           + " Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine "
+           + "Rückenschmerzen mehr.",
+          sample.getDocument());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
new file mode 100644
index 0000000..63968a1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<ConlluSentence> stream = new ConlluStream(streamFactory)) {
+      ConlluSentence sent1 = stream.read();
+
+      Assert.assertEquals("train-s21", sent1.getSentenceIdComment());
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          sent1.getTextComment());
+      Assert.assertEquals(11, sent1.getWordLines().size());
+
+      ConlluSentence sent2 = stream.read();
+
+      Assert.assertEquals("train-s22", sent2.getSentenceIdComment());
+      Assert.assertEquals(
+          "Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.",
+          sent2.getTextComment());
+      Assert.assertEquals(14, sent2.getWordLines().size());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
new file mode 100644
index 0000000..62cb9a6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected1, stream.read());
+
+      TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke ich einen " +
+          "neuen Biss und dadurch endlich keine Rückenschmerzen mehr"
+          + TokenSample.DEFAULT_SEPARATOR_CHARS + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected2, stream.read());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
index 4676f6f..005ec55 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -27,10 +27,10 @@ public class ConlluWordLineTest {
   @Test
   public void testParseLine() throws InvalidFormatException {
     ConlluWordLine line = new ConlluWordLine(
-        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
 
     Assert.assertEquals("12", line.getId());
-    Assert.assertEquals("Händen", line.getForm());
+    Assert.assertEquals("Händen", line.getForm());
     Assert.assertEquals("Hand", line.getLemma());
     Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
     Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
new file mode 100644
index 0000000..13c19da
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
@@ -0,0 +1,30 @@
+# sent_id = train-s21
+# text = Fachlich kompetent, sehr gute Beratung und ein freundliches Team.
+1	Fachlich	fachlich	ADV	ADJD	_	2	advmod	_	_
+2	kompetent	kompetent	ADJ	ADJD	Degree=Pos	0	root	_	SpaceAfter=No
+3	,	,	PUNCT	$,	_	2	punct	_	_
+4	sehr	sehr	ADV	ADV	_	5	advmod	_	_
+5	gute	gut	ADJ	ADJA	Degree=Pos	6	amod	_	_
+6	Beratung	Beratung	NOUN	NN	_	2	parataxis	_	_
+7	und	und	CCONJ	KON	_	10	cc	_	_
+8	ein	ein	DET	ART	Definite=Ind|PronType=Art	10	det	_	_
+9	freundliches	freundlich	ADJ	ADJA	Degree=Pos	10	amod	_	_
+10	Team	Team	NOUN	NN	_	6	conj	_	SpaceAfter=No
+11	.	.	PUNCT	$.	_	2	punct	_	_
+
+# sent_id = train-s22
+# text = Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.
+1	Beiden	beide	PRON	PIAT	Case=Dat|Number=Plur|NumType=Card|PronType=Tot	2	det	_	_
+2	Zahnärzten	Zahnarzt	NOUN	NN	Case=Dat|Number=Plur	3	iobj	_	_
+3	verdanke	verdanken	VERB	VVFIN	Number=Sing|Person=1|VerbForm=Fin	0	root	_	_
+4	ich	ich	PRON	PPER	Case=Nom|Number=Sing|Person=1|PronType=Prs	3	nsubj	_	_
+5	einen	ein	DET	ART	Case=Acc|Definite=Ind|Number=Plur|PronType=Art	7	det	_	_
+6	neuen	neu	ADJ	ADJA	Case=Acc|Degree=Pos|Number=Plur	7	amod	_	_
+7	Biss	Biß	NOUN	NN	Case=Acc|Number=Plur	3	obj	_	_
+8	und	und	CCONJ	KON	_	12	cc	_	_
+9	dadurch	dadurch	ADV	PAV	_	7	advmod	_	_
+10	endlich	endlich	ADV	ADV	_	12	advmod	_	_
+11	keine	kein	PRON	PIAT	PronType=Neg	12	advmod	_	_
+12	Rückenschmerzen	Rückenschmerz	NOUN	NN	_	7	conj	_	_
+13	mehr	mehr	ADV	ADV	_	12	advmod	_	SpaceAfter=No
+14	.	.	PUNCT	$.	_	3	punct	_	_
\ No newline at end of file