You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:09:56 UTC
[12/21] opennlp git commit: OPENNLP-1050: Add formats support for
Irish Sentence Bank
OPENNLP-1050: Add formats support for Irish Sentence Bank
closes #191
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6f80a897
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6f80a897
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6f80a897
Branch: refs/heads/LangDetect
Commit: 6f80a89705d84dd74da902d512ca4682aed07a57
Parents: 5bf5366
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sun Apr 30 21:25:03 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:52:42 2017 +0200
----------------------------------------------------------------------
.../tools/cmdline/StreamFactoryRegistry.java | 5 +
.../IrishSentenceBankDocument.java | 271 +++++++++++++++++++
.../IrishSentenceBankSentenceStream.java | 72 +++++
.../IrishSentenceBankSentenceStreamFactory.java | 61 +++++
.../IrishSentenceBankTokenSampleStream.java | 52 ++++
...ishSentenceBankTokenSampleStreamFactory.java | 60 ++++
.../IrishSentenceBankDocumentTest.java | 67 +++++
.../irishsentencebank-sample.xml | 25 ++
8 files changed, 613 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 2cff212..3d68945 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -54,6 +54,8 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory;
import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory;
import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -119,6 +121,9 @@ public final class StreamFactoryRegistry {
ConlluSentenceSampleStreamFactory.registerFactory();
ConlluPOSSampleStreamFactory.registerFactory();
ConlluLemmaSampleStreamFactory.registerFactory();
+
+ IrishSentenceBankSentenceStreamFactory.registerFactory();
+ IrishSentenceBankTokenSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
new file mode 100644
index 0000000..91ab650
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.StringBuilder;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+/**
+ * A structure to hold an Irish Sentence Bank document, which is a collection
+ * of tokenized sentences.
+ * <p>
+ * The sentence bank is described at, and can be downloaded from,
+ * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">this page</a>.
+ */
+public class IrishSentenceBankDocument {
+
+  /**
+   * Pairs the surface form found in one token slot with the lemmas listed for that slot.
+   */
+  public static class IrishSentenceBankFlex {
+    String surface;
+    String[] flex;
+
+    /**
+     * @return the token text as it appears in the sentence
+     */
+    public String getSurface() {
+      return surface;
+    }
+
+    /**
+     * @return the lemmas given by the {@code flex} elements of this slot
+     */
+    public String[] getFlex() {
+      return flex;
+    }
+
+    /**
+     * @param sf the surface form of the token
+     * @param fl the lemmas listed for the token's slot
+     */
+    public IrishSentenceBankFlex(String sf, String[] fl) {
+      this.surface = sf;
+      this.flex = fl;
+    }
+  }
+
+  /**
+   * One sentence of the bank: its source label, translation, original text,
+   * token spans into the original text, and per-slot flex information.
+   */
+  public static class IrishSentenceBankSentence {
+    private final String source;
+    private final String translation;
+    private final String original;
+    private final Span[] tokens;
+    private final IrishSentenceBankFlex[] flex;
+
+    /**
+     * @return the value of the sentence's {@code source} attribute
+     */
+    public String getSource() {
+      return source;
+    }
+
+    /**
+     * @return the text of the {@code translation} element, or an empty string if there was none
+     */
+    public String getTranslation() {
+      return translation;
+    }
+
+    /**
+     * @return the text of the {@code original} element, tokens and separators concatenated
+     */
+    public String getOriginal() {
+      return original;
+    }
+
+    /**
+     * @return the token spans, as offsets into {@link #getOriginal()}
+     */
+    public Span[] getTokens() {
+      return tokens;
+    }
+
+    /**
+     * @return the flex entries indexed by slot number minus one; slots without a token are {@code null}
+     */
+    public IrishSentenceBankFlex[] getFlex() {
+      return flex;
+    }
+
+    /**
+     * @return a {@link TokenSample} built from the original text and its token spans
+     */
+    public TokenSample getTokenSample() {
+      return new TokenSample(original, tokens);
+    }
+
+    /**
+     * @param src the source label of the sentence
+     * @param trans the translation of the sentence
+     * @param orig the original sentence text
+     * @param toks the token spans into {@code orig}
+     * @param flx the flex entries, indexed by slot number minus one
+     */
+    public IrishSentenceBankSentence(String src, String trans, String orig,
+                                     Span[] toks, IrishSentenceBankFlex[] flx) {
+      this.source = src;
+      this.translation = trans;
+      this.original = orig;
+      this.tokens = toks;
+      this.flex = flx;
+    }
+  }
+
+  private List<IrishSentenceBankSentence> sentences;
+
+  /**
+   * Creates an empty document.
+   */
+  public IrishSentenceBankDocument() {
+    sentences = new ArrayList<IrishSentenceBankSentence>();
+  }
+
+  /**
+   * Appends a sentence to the document.
+   * @param sent the sentence to add
+   */
+  public void add(IrishSentenceBankSentence sent) {
+    this.sentences.add(sent);
+  }
+
+  /**
+   * @return an unmodifiable view of the sentences, in the order they were added
+   */
+  public List<IrishSentenceBankSentence> getSentences() {
+    return Collections.unmodifiableList(sentences);
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores whitespace to the left of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset adjusted to ignore whitespace to the left
+   */
+  private static int advanceLeft(String s, int start) {
+    int ret = start;
+    for (int i = 0; i < s.length(); i++) {
+      if (!Character.isWhitespace(s.charAt(i))) {
+        return ret;
+      }
+      ret++;
+    }
+    return ret;
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores whitespace to the right of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset of the end of the string, adjusted to ignore whitespace to the right
+   */
+  private static int advanceRight(String s, int start) {
+    int ret = start + s.length();
+    // Walk back over every character, including index 0, so that a string made up
+    // entirely of whitespace collapses to the start offset.
+    for (int i = s.length() - 1; i >= 0; i--) {
+      if (!Character.isWhitespace(s.charAt(i))) {
+        return ret;
+      }
+      ret--;
+    }
+    return ret;
+  }
+
+  /**
+   * Reads a required attribute of a node.
+   * @param node the node carrying the attribute
+   * @param attribute the attribute name
+   * @return the attribute value
+   * @throws IOException if the node does not have the attribute
+   */
+  private static String requiredAttribute(Node node, String attribute) throws IOException {
+    Node attr = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
+    if (attr == null) {
+      throw new IOException("Missing attribute " + attribute + " on node: " + node.getNodeName());
+    }
+    return attr.getNodeValue();
+  }
+
+  /**
+   * Reads the numeric {@code slot} attribute of a node.
+   * @param node a {@code token} or {@code flex} node
+   * @return the slot number
+   * @throws IOException if the attribute is missing or is not an integer
+   */
+  private static int parseSlot(Node node) throws IOException {
+    String value = requiredAttribute(node, "slot");
+    try {
+      return Integer.parseInt(value);
+    } catch (NumberFormatException e) {
+      throw new IOException("Invalid slot number: " + value, e);
+    }
+  }
+
+  /**
+   * Converts one {@code sentence} node into an {@link IrishSentenceBankSentence}.
+   * @param sentnode the {@code sentence} node
+   * @return the parsed sentence
+   * @throws IOException if the node contains unexpected children or malformed attributes
+   */
+  private static IrishSentenceBankSentence parseSentence(Node sentnode) throws IOException {
+    String src = requiredAttribute(sentnode, "source");
+    String trans = "";
+    Map<Integer, String> toks = new HashMap<>();
+    Map<Integer, List<String>> flx = new HashMap<>();
+    List<Span> spans = new ArrayList<>();
+    StringBuilder orig = new StringBuilder();
+    // Highest slot number seen so far; it fixes the size of the flex array.
+    int flexes = 1;
+
+    NodeList sentnl = sentnode.getChildNodes();
+    for (int j = 0; j < sentnl.getLength(); j++) {
+      Node child = sentnl.item(j);
+      final String name = child.getNodeName();
+      switch (name) {
+        case "flex":
+          int flexslot = parseSlot(child);
+          flexes = Math.max(flexes, flexslot);
+          flx.computeIfAbsent(flexslot, key -> new ArrayList<>())
+              .add(requiredAttribute(child, "lemma"));
+          break;
+
+        case "translation":
+          Node transText = child.getFirstChild();
+          // An empty translation element has no child node to read from.
+          trans = transText == null ? "" : transText.getTextContent();
+          break;
+
+        case "original":
+          // Offsets are relative to the text accumulated so far.
+          int last = orig.length();
+          NodeList orignl = child.getChildNodes();
+          for (int k = 0; k < orignl.getLength(); k++) {
+            Node part = orignl.item(k);
+            switch (part.getNodeName()) {
+              case "token":
+                Node tokText = part.getFirstChild();
+                if (tokText == null) {
+                  throw new IOException("Empty token in sentence with source: " + src);
+                }
+                String tmptok = tokText.getTextContent();
+                spans.add(new Span(last, last + tmptok.length()));
+
+                int tokslot = parseSlot(part);
+                if (tokslot < 1) {
+                  // Slots index the flex array from 1; anything lower cannot be stored.
+                  throw new IOException("Invalid token slot: " + tokslot);
+                }
+                flexes = Math.max(flexes, tokslot);
+
+                toks.put(tokslot, tmptok);
+                orig.append(tmptok);
+                last += tmptok.length();
+                break;
+
+              case "#text":
+                String tmptxt = part.getTextContent();
+                orig.append(tmptxt);
+
+                int begin = advanceLeft(tmptxt, last);
+                int end = advanceRight(tmptxt, last);
+                // Whitespace-only text separates tokens but is not a token itself.
+                if (begin < end) {
+                  spans.add(new Span(begin, end));
+                }
+
+                last += tmptxt.length();
+                break;
+
+              default:
+                throw new IOException("Unexpected node: " + part.getNodeName());
+            }
+          }
+          break;
+
+        case "#text":
+        case "#comment":
+          break;
+
+        default:
+          throw new IOException("Unexpected node: " + name);
+      }
+    }
+
+    IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
+    for (Map.Entry<Integer, String> tok : toks.entrySet()) {
+      // A token whose slot has no flex element gets an empty lemma array.
+      List<String> lemmas = flx.getOrDefault(tok.getKey(), Collections.<String>emptyList());
+      String[] right = lemmas.toArray(new String[lemmas.size()]);
+      flexa[tok.getKey() - 1] = new IrishSentenceBankFlex(tok.getValue(), right);
+    }
+
+    Span[] spanout = spans.toArray(new Span[spans.size()]);
+    return new IrishSentenceBankSentence(src, trans, orig.toString(), spanout, flexa);
+  }
+
+  /**
+   * Parses an Irish Sentence Bank XML document.
+   * @param is the stream to read the XML from; it is not closed by this method
+   * @return the parsed document
+   * @throws IOException if the XML is not well-formed or does not have the expected structure
+   * @throws IllegalStateException if the XML parser cannot be configured
+   */
+  public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
+    IrishSentenceBankDocument document = new IrishSentenceBankDocument();
+
+    try {
+      DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+      // The input is an arbitrary file, so ask the parser to apply its secure-processing limits.
+      docBuilderFactory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
+      DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+      Document doc = docBuilder.parse(is);
+
+      String root = doc.getDocumentElement().getNodeName();
+      if (!root.equalsIgnoreCase("sentences")) {
+        throw new IOException("Expected root node sentences, but found " + root);
+      }
+
+      NodeList nl = doc.getDocumentElement().getChildNodes();
+      for (int i = 0; i < nl.getLength(); i++) {
+        Node sentnode = nl.item(i);
+        String nodeName = sentnode.getNodeName();
+        if (nodeName.equals("sentence")) {
+          document.add(parseSentence(sentnode));
+        } else if (!nodeName.equals("#text") && !nodeName.equals("#comment")) {
+          throw new IOException("Unexpected node: " + nodeName);
+        }
+      }
+      return document;
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(e);
+    } catch (SAXException e) {
+      throw new IOException("Failed to parse IrishSentenceBank document", e);
+    }
+  }
+
+  /**
+   * Parses an Irish Sentence Bank XML file.
+   * @param file the file to read
+   * @return the parsed document
+   * @throws IOException if the file cannot be read or does not have the expected structure
+   */
+  static IrishSentenceBankDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
new file mode 100644
index 0000000..e7c06d1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Exposes the sentences of an {@link IrishSentenceBankDocument} as {@link SentenceSample}s.
+ * All sentences that remain in the document are delivered together in one sample.
+ */
+class IrishSentenceBankSentenceStream implements ObjectStream<SentenceSample> {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  /**
+   * @param source the document whose sentences are streamed
+   */
+  IrishSentenceBankSentenceStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  /**
+   * Joins every remaining sentence, each followed by a single space, into one sample.
+   *
+   * @return the sample, or {@code null} when no sentence is left
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+
+    // Nothing left to consume: signal the end of the stream.
+    if (!sentenceIt.hasNext()) {
+      return null;
+    }
+
+    StringBuilder text = new StringBuilder();
+    List<Span> spans = new LinkedList<>();
+
+    do {
+      String original = sentenceIt.next().getOriginal();
+      int start = text.length();
+
+      // A sentence without text still contributes an (empty) span.
+      if (original != null) {
+        text.append(original);
+      }
+
+      spans.add(new Span(start, text.length()));
+      text.append(' ');
+    } while (sentenceIt.hasNext());
+
+    Span[] spanArray = spans.toArray(new Span[spans.size()]);
+    return new SentenceSample(text.toString(), spanArray);
+  }
+
+  /**
+   * Restarts iteration from the first sentence of the document.
+   */
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
new file mode 100644
index 0000000..e26dc56
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link SentenceSample} streams from Irish Sentence Bank files and registers
+ * itself under the format name {@code irishsentencebank}.
+ */
+public class IrishSentenceBankSentenceStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /**
+   * Registers this factory for {@link SentenceSample}s with the {@link StreamFactoryRegistry}.
+   */
+  public static void registerFactory() {
+    IrishSentenceBankSentenceStreamFactory factory =
+        new IrishSentenceBankSentenceStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(SentenceSample.class, "irishsentencebank", factory);
+  }
+
+  protected <P> IrishSentenceBankSentenceStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the data file named in the arguments and wraps it in a sentence stream.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   * @return a stream over the sentences of the parsed document
+   */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+
+    Parameters parsedArgs = ArgumentParser.parse(args, Parameters.class);
+    CmdLineUtil.checkInputFile("Data", parsedArgs.getData());
+
+    IrishSentenceBankDocument document = null;
+    try {
+      document = IrishSentenceBankDocument.parse(parsedArgs.getData());
+    } catch (IOException e) {
+      // Read failures are handed to the command line utilities.
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+
+    return new IrishSentenceBankSentenceStream(document);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
new file mode 100644
index 0000000..8cbfac2
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Exposes the sentences of an {@link IrishSentenceBankDocument} as {@link TokenSample}s,
+ * one sample per sentence.
+ */
+class IrishSentenceBankTokenSampleStream implements ObjectStream<TokenSample> {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  /**
+   * @param source the document whose sentences are streamed
+   */
+  IrishSentenceBankTokenSampleStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  /**
+   * @return the token sample of the next sentence, or {@code null} when no sentence is left
+   */
+  @Override
+  public TokenSample read() throws IOException {
+
+    // End of the document: signal the end of the stream.
+    if (!sentenceIt.hasNext()) {
+      return null;
+    }
+
+    return sentenceIt.next().getTokenSample();
+  }
+
+  /**
+   * Restarts iteration from the first sentence of the document.
+   */
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
new file mode 100644
index 0000000..86d1225
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.DetokenizerSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link TokenSample} streams from Irish Sentence Bank files and registers
+ * itself under the format name {@code irishsentencebank}.
+ */
+public class IrishSentenceBankTokenSampleStreamFactory extends DetokenizerSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /**
+   * Registers this factory for {@link TokenSample}s with the {@link StreamFactoryRegistry}.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory(
+        IrishSentenceBankTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> IrishSentenceBankTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the data file named in the arguments and wraps it in a token sample stream.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   * @return a stream with one token sample per sentence of the parsed document
+   */
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+
+    IrishSentenceBankDocument isbDoc = null;
+    try {
+      isbDoc = IrishSentenceBankDocument.parse(params.getData());
+    } catch (IOException ex) {
+      // Read failures are handed to the command line utilities.
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new IrishSentenceBankTokenSampleStream(isbDoc);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
new file mode 100644
index 0000000..671fea0
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+/**
+ * Checks that the bundled two-sentence sample is parsed into the expected
+ * text, flex entries, translation and token spans.
+ */
+public class IrishSentenceBankDocumentTest {
+
+  @Test
+  public void testParsingSimpleDoc() throws IOException {
+    try (InputStream sampleIn =
+        IrishSentenceBankDocumentTest.class.getResourceAsStream("irishsentencebank-sample.xml")) {
+
+      IrishSentenceBankDocument parsed = IrishSentenceBankDocument.parse(sampleIn);
+      List<IrishSentenceBankDocument.IrishSentenceBankSentence> all = parsed.getSentences();
+
+      Assert.assertEquals(2, all.size());
+
+      IrishSentenceBankDocument.IrishSentenceBankSentence first = all.get(0);
+      IrishSentenceBankDocument.IrishSentenceBankSentence second = all.get(1);
+
+      // First sentence: text and per-slot flex data.
+      Assert.assertEquals("A Dhia, tá mé ag iompar clainne!", first.getOriginal());
+
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] firstFlex = first.getFlex();
+      Assert.assertEquals(7, firstFlex.length);
+      Assert.assertEquals("A", firstFlex[0].getSurface());
+      Assert.assertArrayEquals(new String[]{"a"}, firstFlex[0].getFlex());
+
+      // Second sentence: slot 5 carries two lemmas.
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] secondFlex = second.getFlex();
+      Assert.assertEquals("ón", secondFlex[4].getSurface());
+      Assert.assertArrayEquals(new String[]{"ó", "an"}, secondFlex[4].getFlex());
+
+      Assert.assertEquals("Excuse me, are you from the stone age?", second.getTranslation());
+
+      // Token spans of the first sentence: seven words plus two punctuation marks.
+      TokenSample sample = first.getTokenSample();
+      Span[] tokenSpans = sample.getTokenSpans();
+      Assert.assertEquals(9, tokenSpans.length);
+
+      Span eighth = tokenSpans[7];
+      Assert.assertEquals(24, eighth.getStart());
+      Assert.assertEquals(31, eighth.getEnd());
+      Assert.assertEquals("clainne", sample.getText().substring(eighth.getStart(), eighth.getEnd()));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
new file mode 100644
index 0000000..91e84c1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
@@ -0,0 +1,25 @@
+<sentences datestamp="2015-03-10">
+<sentence source='potaL'>
+ <original xml:space="preserve"><token slot='1'>A</token> <token slot='2'>Dhia</token>, <token slot='3'>tá</token> <token slot='4'>mé</token> <token slot='5'>ag</token> <token slot='6'>iompar</token> <token slot='7'>clainne</token>!</original>
+ <translation>Oh my God, I'm pregnant!</translation>
+ <flex slot='1' lemma='a'/>
+ <flex slot='2' lemma='dia'/>
+ <flex slot='3' lemma='bí'/>
+ <flex slot='4' lemma='mé'/>
+ <flex slot='5' lemma='ag'/>
+ <flex slot='6' lemma='iompair'/>
+ <flex slot='7' lemma='clann'/>
+</sentence>
+<sentence source='potaL'>
+ <original xml:space="preserve"><token slot='1'>Gabh</token> <token slot='2'>mo</token> <token slot='3'>leithscéal</token>, <token slot='4'>an</token> <token slot='5'>ón</token> <token slot='6'>chlochaois</token> <token slot='7'>thú</token>?</original>
+ <translation>Excuse me, are you from the stone age?</translation>
+ <flex slot='1' lemma='gabh'/>
+ <flex slot='2' lemma='mo'/>
+ <flex slot='3' lemma='leithscéal'/>
+ <flex slot='4' lemma='an'/>
+ <flex slot='5' lemma='ó'/>
+ <flex slot='5' lemma='an'/>
+ <flex slot='6' lemma='clochaois'/>
+ <flex slot='7' lemma='thú'/>
+</sentence>
+</sentences>