You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/02/07 14:19:18 UTC

opennlp git commit: OPENNLP-975: Add format support for CoNLL-U format

Repository: opennlp
Updated Branches:
  refs/heads/master 46fbcbf04 -> 740b6e341


OPENNLP-975: Add format support for CoNLL-U format


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/740b6e34
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/740b6e34
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/740b6e34

Branch: refs/heads/master
Commit: 740b6e34168a3cf38ba8ea88a9450babbe76393a
Parents: 46fbcbf
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon Feb 6 19:43:33 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue Feb 7 15:01:54 2017 +0100

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |   5 +
 .../formats/conllu/ConlluLemmaSampleStream.java |  57 ++++++++
 .../conllu/ConlluLemmaSampleStreamFactory.java  |  82 ++++++++++++
 .../formats/conllu/ConlluPOSSampleStream.java   |  56 ++++++++
 .../conllu/ConlluPOSSampleStreamFactory.java    |  82 ++++++++++++
 .../tools/formats/conllu/ConlluSentence.java    |  33 +++++
 .../tools/formats/conllu/ConlluStream.java      |  75 +++++++++++
 .../tools/formats/conllu/ConlluTagset.java      |  23 ++++
 .../tools/formats/conllu/ConlluWordLine.java    | 130 +++++++++++++++++++
 .../formats/conllu/ConlluWordLineTest.java      |  43 ++++++
 10 files changed, 586 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 56625a9..9977519 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -42,6 +42,8 @@ import opennlp.tools.formats.ad.ADPOSSampleStreamFactory;
 import opennlp.tools.formats.ad.ADSentenceSampleStreamFactory;
 import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
 import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -110,6 +112,9 @@ public final class StreamFactoryRegistry {
 
     LetsmtSentenceStreamFactory.registerFactory();
     MosesSentenceSampleStreamFactory.registerFactory();
+
+    ConlluPOSSampleStreamFactory.registerFactory();
+    ConlluLemmaSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
new file mode 100644
index 0000000..0782120
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Converts the {@link ConlluSentence}s read from an underlying stream into
+ * {@link LemmaSample}s.
+ * <p>
+ * For every word line of a sentence the form, the part-of-speech tag of the
+ * configured {@link ConlluTagset} and the lemma are copied into the sample.
+ */
+public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence, LemmaSample> {
+
+  private final ConlluTagset tagset;
+
+  /**
+   * @param samples the stream of sentences to convert
+   * @param tagset selects which part-of-speech tag of a word line is used, must not be null
+   *
+   * @throws NullPointerException if tagset is null
+   */
+  ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
+    super(samples);
+    // Reject a missing tagset right away, the same way ConlluPOSSampleStream does,
+    // rather than failing later while a sentence is being converted.
+    this.tagset = Objects.requireNonNull(tagset);
+  }
+
+  /**
+   * Reads the next sentence from the underlying stream and converts it.
+   *
+   * @return the converted sample, or null if the underlying stream returned null
+   */
+  @Override
+  public LemmaSample read() throws IOException {
+    ConlluSentence sentence = samples.read();
+
+    if (sentence != null) {
+      List<String> tokens = new ArrayList<>();
+      List<String> tags = new ArrayList<>();
+      List<String> lemmas = new ArrayList<>();
+
+      // The three lists are filled in lockstep, one entry per word line.
+      for (ConlluWordLine line : sentence.getWordLines()) {
+        tokens.add(line.getForm());
+        tags.add(line.getPosTag(tagset));
+        lemmas.add(line.getLemma());
+      }
+
+      return new LemmaSample(tokens, tags, lemmas);
+    }
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
new file mode 100644
index 0000000..4806967
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link LemmaSample} streams from data in the CoNLL-U format.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> {
+
+  /** Name under which this factory is registered. */
+  public static final String CONLLU_FORMAT = "conllu";
+
+  /** Command line parameters understood by this factory. */
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "tagset",
+        description = "u|x u for unified tags and x for language-specific part-of-speech tags")
+    @ArgumentParser.OptionalParameter(defaultValue = "u")
+    String getTagset();
+  }
+
+  /**
+   * Registers an instance of this factory for {@link LemmaSample}s under the
+   * {@link #CONLLU_FORMAT} name.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LemmaSample.class,
+        CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments and creates the sample stream.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   *
+   * @return the stream of lemma samples
+   *
+   * @throws TerminateToolException if the tagset parameter is neither "u" nor "x"
+   */
+  public ObjectStream<LemmaSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    ConlluTagset tagset;
+
+    switch (params.getTagset()) {
+      case "u":
+        tagset = ConlluTagset.U;
+        break;
+      case "x":
+        tagset = ConlluTagset.X;
+        break;
+      default:
+        throw new TerminateToolException(-1, "Unknown tagset parameter: " + params.getTagset());
+    }
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluLemmaSampleStream(new ConlluStream(inFactory), tagset);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
new file mode 100644
index 0000000..28dddc0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Turns the {@link ConlluSentence}s of an underlying stream into {@link POSSample}s,
+ * taking the form and the part-of-speech tag of the configured {@link ConlluTagset}
+ * from every word line.
+ */
+public class ConlluPOSSampleStream extends FilterObjectStream<ConlluSentence, POSSample> {
+
+  private final ConlluTagset tagset;
+
+  /**
+   * @param samples the stream of sentences to convert
+   * @param tagset selects which part-of-speech tag of a word line is used, must not be null
+   */
+  ConlluPOSSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
+    super(samples);
+    this.tagset = Objects.requireNonNull(tagset);
+  }
+
+  /**
+   * Reads and converts the next sentence.
+   *
+   * @return the converted sample, or null if the underlying stream returned null
+   */
+  @Override
+  public POSSample read() throws IOException {
+    ConlluSentence next = samples.read();
+
+    // Nothing left to convert.
+    if (next == null) {
+      return null;
+    }
+
+    List<String> forms = new ArrayList<>();
+    List<String> posTags = new ArrayList<>();
+
+    for (ConlluWordLine wordLine : next.getWordLines()) {
+      forms.add(wordLine.getForm());
+      posTags.add(wordLine.getPosTag(tagset));
+    }
+
+    return new POSSample(forms, posTags);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
new file mode 100644
index 0000000..0f9d5f3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link POSSample} streams from data in the CoNLL-U format.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class ConlluPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+
+  /** Name under which this factory is registered. */
+  public static final String CONLLU_FORMAT = "conllu";
+
+  /** Command line parameters understood by this factory. */
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "tagset",
+        description = "u|x u for unified tags and x for language-specific part-of-speech tags")
+    @ArgumentParser.OptionalParameter(defaultValue = "u")
+    String getTagset();
+  }
+
+  /**
+   * Registers an instance of this factory for {@link POSSample}s under the
+   * {@link #CONLLU_FORMAT} name.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(POSSample.class,
+        CONLLU_FORMAT, new ConlluPOSSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> ConlluPOSSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments and creates the sample stream.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   *
+   * @return the stream of part-of-speech samples
+   *
+   * @throws TerminateToolException if the tagset parameter is neither "u" nor "x"
+   */
+  public ObjectStream<POSSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    ConlluTagset tagset;
+
+    switch (params.getTagset()) {
+      case "u":
+        tagset = ConlluTagset.U;
+        break;
+      case "x":
+        tagset = ConlluTagset.X;
+        break;
+      default:
+        throw new TerminateToolException(-1, "Unknown tagset parameter: " + params.getTagset());
+    }
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluPOSSampleStream(new ConlluStream(inFactory), tagset);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
new file mode 100644
index 0000000..5d92d89
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.util.List;
+
+/**
+ * A sentence as a list of {@link ConlluWordLine}s.
+ */
+public class ConlluSentence {
+
+  // Assigned once in the constructor and never replaced afterwards.
+  private final List<ConlluWordLine> wordLines;
+
+  /**
+   * @param wordLines the word lines of the sentence; the list is stored as is, not copied
+   */
+  ConlluSentence(List<ConlluWordLine> wordLines) {
+    this.wordLines = wordLines;
+  }
+
+  /**
+   * Retrieves the word lines of this sentence.
+   *
+   * @return the list that was passed to the constructor
+   */
+  public List<ConlluWordLine> getWordLines() {
+    return wordLines;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
new file mode 100644
index 0000000..873a9ed
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * The CoNNL-U Format is specified here:
+ * http://universaldependencies.org/format.html
+ */
+public class ConlluStream implements ObjectStream<ConlluSentence> {
+  private final ObjectStream<String> sentenceStream;
+
+  public ConlluStream(InputStreamFactory in) throws IOException {
+    this.sentenceStream = new ParagraphStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+  }
+
+  @Override
+  public ConlluSentence read() throws IOException {
+    String sentence = sentenceStream.read();
+
+    if (sentence != null) {
+      List<ConlluWordLine> wordLines = new ArrayList<>();
+
+      BufferedReader reader = new BufferedReader(new StringReader(sentence));
+
+      String line;
+      while ((line = reader.readLine())  != null) {
+        // # indicates a comment line and should be skipped
+        if (!line.trim().startsWith("#")) {
+          wordLines.add(new ConlluWordLine(line));
+        }
+      }
+
+      return new ConlluSentence(wordLines);
+    }
+
+    return null;
+  }
+
+  @Override
+  public void close() throws IOException {
+    sentenceStream.close();
+  }
+
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    sentenceStream.reset();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
new file mode 100644
index 0000000..f49f3fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+/**
+ * Selects which of the two part-of-speech tags of a word line is used.
+ */
+enum ConlluTagset {
+  /** The universal part-of-speech tag. */
+  U,
+  /** The language-specific part-of-speech tag. */
+  X
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
new file mode 100644
index 0000000..9881bf1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import opennlp.tools.util.InvalidFormatException;
+
+/**
+ * A single word line, split at tab characters into its ten fields.
+ */
+public class ConlluWordLine {
+
+  private final String id;
+  private final String form;
+  private final String lemma;
+  private final String uPosTag;
+  private final String xPosTag;
+  private final String feats;
+  private final String head;
+  private final String deprel;
+  private final String deps;
+  private final String misc;
+
+  /**
+   * Parses a line by splitting it at tab characters.
+   *
+   * @param line the tab separated line to parse
+   *
+   * @throws InvalidFormatException if the line does not consist of exactly 10 fields
+   */
+  ConlluWordLine(String line) throws InvalidFormatException {
+
+    String[] fields = line.split("\t");
+
+    if (fields.length != 10) {
+      // Report what was actually found so the offending line can be located in the data.
+      throw new InvalidFormatException("Line must have exactly 10 fields, but found "
+          + fields.length + ": " + line);
+    }
+
+    id = fields[0];
+    form = fields[1];
+    lemma = fields[2];
+    uPosTag = fields[3];
+    xPosTag = fields[4];
+    feats = fields[5];
+    head = fields[6];
+    deprel = fields[7];
+    deps = fields[8];
+    misc = fields[9];
+  }
+
+  /**
+   * Retrieves the word index. An Integer starting at 1 for each new sentence;
+   * may be a range for multiword tokens; may be a decimal number for empty nodes.
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Retrieve the word form or punctuation symbol.
+   */
+  public String getForm() {
+    return form;
+  }
+
+  /**
+   * Retrieve the lemma or stem of the word form.
+   */
+  public String getLemma() {
+    return lemma;
+  }
+
+  /**
+   * Retrieve the Universal part-of-speech tag or the language-specific part-of-speech tag;
+   * underscore if not available.
+   *
+   * @param tagset the type of tag to retrieve, either universal (u) or language specific (x)
+   *
+   * @return the content of the part-of-speech field selected by the tagset
+   */
+  public String getPosTag(ConlluTagset tagset) {
+    switch (tagset) {
+      case U:
+        return uPosTag;
+      case X:
+        return xPosTag;
+      default:
+        throw new IllegalStateException("Unexpected tagset value: " + tagset);
+    }
+  }
+
+  /**
+   * Retrieve list of morphological features from the universal feature inventory or from a
+   * defined language-specific extension; underscore if not available.
+   */
+  public String getFeats() {
+    return feats;
+  }
+
+  /**
+   * Head of the current word, which is either a value of ID or zero (0).
+   */
+  public String getHead() {
+    return head;
+  }
+
+  /**
+   * Universal dependency relation to the HEAD (root iff HEAD = 0) or a
+   * defined language-specific subtype of one.
+   */
+  public String getDeprel() {
+    return deprel;
+  }
+
+  /**
+   * Enhanced dependency graph in the form of a list of head-deprel pairs.
+   */
+  public String getDeps() {
+    return deps;
+  }
+
+  /**
+   * Retrieve any other annotation.
+   */
+  public String getMisc() {
+    return misc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
new file mode 100644
index 0000000..4676f6f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.InvalidFormatException;
+
+/**
+ * Tests for the parsing of a single line by {@link ConlluWordLine}.
+ */
+public class ConlluWordLineTest {
+
+  /** A well-formed line exposes each of its ten fields through the matching getter. */
+  @Test
+  public void testParseLine() throws InvalidFormatException {
+    ConlluWordLine line = new ConlluWordLine(
+        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+
+    Assert.assertEquals("12", line.getId());
+    Assert.assertEquals("Händen", line.getForm());
+    Assert.assertEquals("Hand", line.getLemma());
+    Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
+    Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));
+    Assert.assertEquals("Case=Dat|Number=Plur", line.getFeats());
+    Assert.assertEquals("5", line.getHead());
+    Assert.assertEquals("nmod", line.getDeprel());
+    Assert.assertEquals("_", line.getDeps());
+    Assert.assertEquals("_", line.getMisc());
+  }
+
+  /** A line with fewer than ten tab separated fields is rejected. */
+  @Test(expected = InvalidFormatException.class)
+  public void testParseLineWithTooFewFields() throws InvalidFormatException {
+    new ConlluWordLine("12\tHänden\tHand");
+  }
+}