You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/02/07 14:19:18 UTC
opennlp git commit: OPENNLP-975: Add format support for CoNLL-U format
Repository: opennlp
Updated Branches:
refs/heads/master 46fbcbf04 -> 740b6e341
OPENNLP-975: Add format support for CoNLL-U format
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/740b6e34
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/740b6e34
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/740b6e34
Branch: refs/heads/master
Commit: 740b6e34168a3cf38ba8ea88a9450babbe76393a
Parents: 46fbcbf
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon Feb 6 19:43:33 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue Feb 7 15:01:54 2017 +0100
----------------------------------------------------------------------
.../tools/cmdline/StreamFactoryRegistry.java | 5 +
.../formats/conllu/ConlluLemmaSampleStream.java | 57 ++++++++
.../conllu/ConlluLemmaSampleStreamFactory.java | 82 ++++++++++++
.../formats/conllu/ConlluPOSSampleStream.java | 56 ++++++++
.../conllu/ConlluPOSSampleStreamFactory.java | 82 ++++++++++++
.../tools/formats/conllu/ConlluSentence.java | 33 +++++
.../tools/formats/conllu/ConlluStream.java | 75 +++++++++++
.../tools/formats/conllu/ConlluTagset.java | 23 ++++
.../tools/formats/conllu/ConlluWordLine.java | 130 +++++++++++++++++++
.../formats/conllu/ConlluWordLineTest.java | 43 ++++++
10 files changed, 586 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 56625a9..9977519 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -42,6 +42,8 @@ import opennlp.tools.formats.ad.ADPOSSampleStreamFactory;
import opennlp.tools.formats.ad.ADSentenceSampleStreamFactory;
import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -110,6 +112,9 @@ public final class StreamFactoryRegistry {
LetsmtSentenceStreamFactory.registerFactory();
MosesSentenceSampleStreamFactory.registerFactory();
+
+ ConlluPOSSampleStreamFactory.registerFactory();
+ ConlluLemmaSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
new file mode 100644
index 0000000..0782120
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence, LemmaSample> {
+
+ private final ConlluTagset tagset;
+
+ ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
+ super(samples);
+ this.tagset = tagset;
+ }
+
+ @Override
+ public LemmaSample read() throws IOException {
+ ConlluSentence sentence = samples.read();
+
+ if (sentence != null) {
+ List<String> tokens = new ArrayList<>();
+ List<String> tags = new ArrayList<>();
+ List<String> lemmas = new ArrayList<>();
+
+ for (ConlluWordLine line : sentence.getWordLines()) {
+ tokens.add(line.getForm());
+ tags.add(line.getPosTag(tagset));
+ lemmas.add(line.getLemma());
+ }
+
+ return new LemmaSample(tokens, tags, lemmas);
+ }
+
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
new file mode 100644
index 0000000..4806967
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> {
+
+ public static final String CONLLU_FORMAT = "conllu";
+
+ interface Parameters extends BasicFormatParams {
+ @ArgumentParser.ParameterDescription(valueName = "tagset",
+ description = "u|x u for unified tags and x for language-specific part-of-speech tags")
+ @ArgumentParser.OptionalParameter(defaultValue = "u")
+ String getTagset();
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(LemmaSample.class,
+ CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+ }
+
+ protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+
+ public ObjectStream<LemmaSample> create(String[] args) {
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ ConlluTagset tagset;
+
+ switch (params.getTagset()) {
+ case "u":
+ tagset = ConlluTagset.U;
+ break;
+ case "x":
+ tagset = ConlluTagset.X;
+ break;
+ default:
+ throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset());
+ }
+
+ InputStreamFactory inFactory =
+ CmdLineUtil.createInputStreamFactory(params.getData());
+
+ try {
+ return new ConlluLemmaSampleStream(new ConlluStream(inFactory), tagset);
+ } catch (IOException e) {
+ // That will throw an exception
+ CmdLineUtil.handleCreateObjectStreamError(e);
+ }
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
new file mode 100644
index 0000000..28dddc0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluPOSSampleStream extends FilterObjectStream<ConlluSentence, POSSample> {
+
+ private final ConlluTagset tagset;
+
+ ConlluPOSSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
+ super(samples);
+ this.tagset = Objects.requireNonNull(tagset);
+ }
+
+ @Override
+ public POSSample read() throws IOException {
+ ConlluSentence sentence = samples.read();
+
+ if (sentence != null) {
+ List<String> tokens = new ArrayList<>();
+ List<String> tags = new ArrayList<>();
+
+ for (ConlluWordLine line : sentence.getWordLines()) {
+ tokens.add(line.getForm());
+ tags.add(line.getPosTag(tagset));
+ }
+
+ return new POSSample(tokens, tags);
+ }
+
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
new file mode 100644
index 0000000..0f9d5f3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class ConlluPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+
+ public static final String CONLLU_FORMAT = "conllu";
+
+ interface Parameters extends BasicFormatParams {
+ @ArgumentParser.ParameterDescription(valueName = "tagset",
+ description = "u|x u for unified tags and x for language-specific part-of-speech tags")
+ @ArgumentParser.OptionalParameter(defaultValue = "u")
+ String getTagset();
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(POSSample.class,
+ CONLLU_FORMAT, new ConlluPOSSampleStreamFactory(Parameters.class));
+ }
+
+ protected <P> ConlluPOSSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+
+ public ObjectStream<POSSample> create(String[] args) {
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ ConlluTagset tagset;
+
+ switch (params.getTagset()) {
+ case "u":
+ tagset = ConlluTagset.U;
+ break;
+ case "x":
+ tagset = ConlluTagset.X;
+ break;
+ default:
+ throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset());
+ }
+
+ InputStreamFactory inFactory =
+ CmdLineUtil.createInputStreamFactory(params.getData());
+
+ try {
+ return new ConlluPOSSampleStream(new ConlluStream(inFactory), tagset);
+ } catch (IOException e) {
+ // That will throw an exception
+ CmdLineUtil.handleCreateObjectStreamError(e);
+ }
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
new file mode 100644
index 0000000..5d92d89
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.util.List;
+
+public class ConlluSentence {
+
+ private List<ConlluWordLine> wordLines;
+
+ ConlluSentence(List<ConlluWordLine> wordLines) {
+ this.wordLines = wordLines;
+ }
+
+ public List<ConlluWordLine> getWordLines() {
+ return wordLines;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
new file mode 100644
index 0000000..873a9ed
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * The CoNLL-U Format is specified here:
+ * http://universaldependencies.org/format.html
+ */
+public class ConlluStream implements ObjectStream<ConlluSentence> {
+ private final ObjectStream<String> sentenceStream;
+
+ public ConlluStream(InputStreamFactory in) throws IOException {
+ this.sentenceStream = new ParagraphStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+ }
+
+ @Override
+ public ConlluSentence read() throws IOException {
+ String sentence = sentenceStream.read();
+
+ if (sentence != null) {
+ List<ConlluWordLine> wordLines = new ArrayList<>();
+
+ BufferedReader reader = new BufferedReader(new StringReader(sentence));
+
+ String line;
+ while ((line = reader.readLine()) != null) {
+ // # indicates a comment line and should be skipped
+ if (!line.trim().startsWith("#")) {
+ wordLines.add(new ConlluWordLine(line));
+ }
+ }
+
+ return new ConlluSentence(wordLines);
+ }
+
+ return null;
+ }
+
+ @Override
+ public void close() throws IOException {
+ sentenceStream.close();
+ }
+
+ @Override
+ public void reset() throws IOException, UnsupportedOperationException {
+ sentenceStream.reset();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
new file mode 100644
index 0000000..f49f3fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+enum ConlluTagset {
+ U,
+ X
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
new file mode 100644
index 0000000..9881bf1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import opennlp.tools.util.InvalidFormatException;
+
+public class ConlluWordLine {
+
+ private final String id;
+ private final String form;
+ private final String lemma;
+ private final String uPosTag;
+ private final String xPosTag;
+ private final String feats;
+ private final String head;
+ private final String deprel;
+ private final String deps;
+ private final String misc;
+
+ ConlluWordLine(String line) throws InvalidFormatException {
+
+ String[] fields = line.split("\t");
+
+ if (fields.length != 10) {
+ throw new InvalidFormatException("Line must have exactly 10 fields");
+ }
+
+ id = fields[0];
+ form = fields[1];
+ lemma = fields[2];
+ uPosTag = fields[3];
+ xPosTag = fields[4];
+ feats = fields[5];
+ head = fields[6];
+ deprel = fields[7];
+ deps = fields[8];
+ misc = fields[9];
+ }
+
+ /**
+ * Retrieves the word index. An Integer starting at 1 for each new sentence;
+ * may be a range for multiword tokens; may be a decimal number for empty nodes.
+ */
+ public String getId() {
+ return id;
+ }
+
+ /**
+ * Retrieve the word form or punctuation symbol.
+ */
+ public String getForm() {
+ return form;
+ }
+
+ /**
+ * Retrieve the lemma or stem of the word form.
+ */
+ public String getLemma() {
+ return lemma;
+ }
+
+ /**
+ * Retrieve the Universal part-of-speech tag or the language-specific part-of-speech tag;
+ * underscore if not available.
+ *
+ * @param tagset the type of tag to retrieve, either universal (u) or language specific (x)
+ */
+ public String getPosTag(ConlluTagset tagset) {
+ switch (tagset) {
+ case U:
+ return uPosTag;
+ case X:
+ return xPosTag;
+ default:
+ throw new IllegalStateException("Unexpected tagset value: " + tagset);
+ }
+ }
+
+ /**
+ * Retrieve list of morphological features from the universal feature inventory or from a
+ * defined language-specific extension; underscore if not available.
+ */
+ public String getFeats() {
+ return feats;
+ }
+
+ /**
+ * Head of the current word, which is either a value of ID or zero (0).
+ */
+ public String getHead() {
+ return head;
+ }
+
+ /**
+ * Universal dependency relation to the HEAD (root iff HEAD = 0) or a
+ * defined language-specific subtype of one.
+ */
+ public String getDeprel() {
+ return deprel;
+ }
+
+ /**
+ * Enhanced dependency graph in the form of a list of head-deprel pairs.
+ */
+ public String getDeps() {
+ return deps;
+ }
+
+ /**
+ * Retrieve any other annotation.
+ */
+ public String getMisc() {
+ return misc;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/740b6e34/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
new file mode 100644
index 0000000..4676f6f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.InvalidFormatException;
+
+public class ConlluWordLineTest {
+
+ @Test
+ public void testParseLine() throws InvalidFormatException {
+ ConlluWordLine line = new ConlluWordLine(
+ "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+
+ Assert.assertEquals("12", line.getId());
+ Assert.assertEquals("Händen", line.getForm());
+ Assert.assertEquals("Hand", line.getLemma());
+ Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
+ Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));
+ Assert.assertEquals("Case=Dat|Number=Plur", line.getFeats());
+ Assert.assertEquals("5", line.getHead());
+ Assert.assertEquals("nmod", line.getDeprel());
+ Assert.assertEquals("_", line.getDeps());
+ Assert.assertEquals("_", line.getMisc());
+ }
+}