You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:09:56 UTC

[12/21] opennlp git commit: OPENNLP-1050: Add formats support for Irish Sentence Bank

OPENNLP-1050: Add formats support for Irish Sentence Bank

closes #191


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6f80a897
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6f80a897
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6f80a897

Branch: refs/heads/LangDetect
Commit: 6f80a89705d84dd74da902d512ca4682aed07a57
Parents: 5bf5366
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sun Apr 30 21:25:03 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:52:42 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |   5 +
 .../IrishSentenceBankDocument.java              | 271 +++++++++++++++++++
 .../IrishSentenceBankSentenceStream.java        |  72 +++++
 .../IrishSentenceBankSentenceStreamFactory.java |  61 +++++
 .../IrishSentenceBankTokenSampleStream.java     |  52 ++++
 ...ishSentenceBankTokenSampleStreamFactory.java |  60 ++++
 .../IrishSentenceBankDocumentTest.java          |  67 +++++
 .../irishsentencebank-sample.xml                |  25 ++
 8 files changed, 613 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 2cff212..3d68945 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -54,6 +54,8 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -119,6 +121,9 @@ public final class StreamFactoryRegistry {
     ConlluSentenceSampleStreamFactory.registerFactory();
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
+
+    IrishSentenceBankSentenceStreamFactory.registerFactory();
+    IrishSentenceBankTokenSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
new file mode 100644
index 0000000..91ab650
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
@@ -0,0 +1,346 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.StringBuilder;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.XMLConstants;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+/**
+ * A structure to hold an Irish Sentence Bank document, which is a collection
+ * of tokenized sentences.
+ * <p>
+ * The sentence bank can be downloaded from, and is described
+ * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">here</a>
+ */
+public class IrishSentenceBankDocument {
+
+  /**
+   * The surface form of one {@code token} element together with the lemmas of
+   * the {@code flex} elements that carry the same slot number.
+   */
+  public static class IrishSentenceBankFlex {
+    private final String surface;
+    private final String[] flex;
+
+    public IrishSentenceBankFlex(String sf, String[] fl) {
+      this.surface = sf;
+      this.flex = fl;
+    }
+
+    public String getSurface() {
+      return surface;
+    }
+
+    public String[] getFlex() {
+      return flex;
+    }
+  }
+
+  /**
+   * One {@code sentence} element: its source attribute, translation, original
+   * text, the token spans over that text, and the per-slot flex entries.
+   */
+  public static class IrishSentenceBankSentence {
+    private final String source;
+    private final String translation;
+    private final String original;
+    private final Span[] tokens;
+    private final IrishSentenceBankFlex[] flex;
+
+    public IrishSentenceBankSentence(String src, String trans, String orig,
+                                     Span[] toks, IrishSentenceBankFlex[] flx) {
+      this.source = src;
+      this.translation = trans;
+      this.original = orig;
+      this.tokens = toks;
+      this.flex = flx;
+    }
+
+    public String getSource() {
+      return source;
+    }
+
+    public String getTranslation() {
+      return translation;
+    }
+
+    public String getOriginal() {
+      return original;
+    }
+
+    public Span[] getTokens() {
+      return tokens;
+    }
+
+    /**
+     * @return the flex entries indexed by {@code slot - 1}; an entry is
+     *     {@code null} for a slot that has no {@code token} element
+     */
+    public IrishSentenceBankFlex[] getFlex() {
+      return flex;
+    }
+
+    public TokenSample getTokenSample() {
+      return new TokenSample(original, tokens);
+    }
+  }
+
+  private final List<IrishSentenceBankSentence> sentences;
+
+  public IrishSentenceBankDocument() {
+    sentences = new ArrayList<>();
+  }
+
+  public void add(IrishSentenceBankSentence sent) {
+    this.sentences.add(sent);
+  }
+
+  /**
+   * @return a read-only view of the sentences added so far
+   */
+  public List<IrishSentenceBankSentence> getSentences() {
+    return Collections.unmodifiableList(sentences);
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset adjusted to ignore spaces to the left
+   */
+  private static int advanceLeft(String s, int start) {
+    int skipped = 0;
+    while (skipped < s.length() && s.charAt(skipped) == ' ') {
+      skipped++;
+    }
+    return start + skipped;
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset of the end of the string, adjusted to ignore spaces to the right;
+   *     equal to {@code start} when the string holds nothing but spaces
+   */
+  private static int advanceRight(String s, int start) {
+    int end = s.length();
+    // Index 0 is inspected as well, so an all-space string collapses to an empty range.
+    while (end > 0 && s.charAt(end - 1) == ' ') {
+      end--;
+    }
+    return start + end;
+  }
+
+  /**
+   * Reads an Irish Sentence Bank XML document.
+   *
+   * @param is the stream to read the XML from
+   * @return the parsed document
+   * @throws IOException if the XML cannot be read or does not have the expected structure
+   */
+  public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
+    IrishSentenceBankDocument document = new IrishSentenceBankDocument();
+
+    try {
+      DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+      // The corpus file is external input: have the parser apply its secure-processing limits.
+      docBuilderFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+      DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+      Document doc = docBuilder.parse(is);
+
+      String root = doc.getDocumentElement().getNodeName();
+      if (!root.equalsIgnoreCase("sentences")) {
+        throw new IOException("Expected root node sentences but found " + root);
+      }
+
+      NodeList nl = doc.getDocumentElement().getChildNodes();
+      for (int i = 0; i < nl.getLength(); i++) {
+        Node sentnode = nl.item(i);
+        String name = sentnode.getNodeName();
+        if ("sentence".equals(name)) {
+          document.add(parseSentence(sentnode));
+        } else if (!"#text".equals(name) && !"#comment".equals(name)) {
+          throw new IOException("Unexpected node: " + name);
+        }
+      }
+      return document;
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(e);
+    } catch (SAXException e) {
+      throw new IOException("Failed to parse IrishSentenceBank document", e);
+    }
+  }
+
+  /**
+   * Converts one {@code sentence} element into a sentence object.
+   */
+  private static IrishSentenceBankSentence parseSentence(Node sentnode) throws IOException {
+    String src = requiredAttribute(sentnode, "source");
+    String trans = "";
+    Map<Integer, String> toks = new HashMap<>();
+    Map<Integer, List<String>> flx = new HashMap<>();
+    List<Span> spans = new ArrayList<>();
+    StringBuilder orig = new StringBuilder();
+
+    NodeList sentnl = sentnode.getChildNodes();
+    for (int j = 0; j < sentnl.getLength(); j++) {
+      Node child = sentnl.item(j);
+      String name = child.getNodeName();
+      switch (name) {
+        case "flex":
+          int flexslot = parseSlot(child);
+          String lemma = requiredAttribute(child, "lemma");
+          flx.computeIfAbsent(flexslot, k -> new ArrayList<>()).add(lemma);
+          break;
+
+        case "translation":
+          // Element-level text content is empty, not null, for <translation/>.
+          trans = child.getTextContent();
+          break;
+
+        case "original":
+          parseOriginal(child, orig, spans, toks);
+          break;
+
+        case "#text":
+        case "#comment":
+          break;
+
+        default:
+          throw new IOException("Unexpected node: " + name);
+      }
+    }
+
+    // The flex array covers slots 1..max, and always has at least one entry.
+    int flexes = 1;
+    for (int slot : flx.keySet()) {
+      flexes = Math.max(flexes, slot);
+    }
+    for (int slot : toks.keySet()) {
+      flexes = Math.max(flexes, slot);
+    }
+
+    IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
+    for (Map.Entry<Integer, String> tok : toks.entrySet()) {
+      // A token whose slot has no <flex> element gets an empty lemma array.
+      List<String> lemmas = flx.getOrDefault(tok.getKey(), Collections.emptyList());
+      flexa[tok.getKey() - 1] = new IrishSentenceBankFlex(tok.getValue(),
+          lemmas.toArray(new String[0]));
+    }
+
+    return new IrishSentenceBankSentence(src, trans, orig.toString(),
+        spans.toArray(new Span[0]), flexa);
+  }
+
+  /**
+   * Appends the content of one {@code original} element to {@code orig}, recording a span
+   * for every token and for every run of text between tokens that is more than spaces.
+   */
+  private static void parseOriginal(Node original, StringBuilder orig, List<Span> spans,
+                                    Map<Integer, String> toks) throws IOException {
+    NodeList orignl = original.getChildNodes();
+    for (int k = 0; k < orignl.getLength(); k++) {
+      Node part = orignl.item(k);
+      // Span offsets are positions in the text collected so far.
+      int last = orig.length();
+      switch (part.getNodeName()) {
+        case "token":
+          String tmptok = part.getTextContent();
+          if (tmptok.isEmpty()) {
+            throw new IOException("Empty token element");
+          }
+          int tokslot = parseSlot(part);
+          if (tokslot < 1) {
+            throw new IOException("Token slot must be 1 or greater: " + tokslot);
+          }
+          spans.add(new Span(last, last + tmptok.length()));
+          toks.put(tokslot, tmptok);
+          orig.append(tmptok);
+          break;
+
+        case "#text":
+          String tmptxt = part.getTextContent();
+          int begin = advanceLeft(tmptxt, last);
+          int end = advanceRight(tmptxt, last);
+          // Text made up only of spaces separates tokens; it is not a token itself.
+          if (begin < end) {
+            spans.add(new Span(begin, end));
+          }
+          orig.append(tmptxt);
+          break;
+
+        default:
+          throw new IOException("Unexpected node: " + part.getNodeName());
+      }
+    }
+  }
+
+  /**
+   * Reads the {@code slot} attribute of a {@code flex} or {@code token} element as a number.
+   */
+  private static int parseSlot(Node node) throws IOException {
+    String value = requiredAttribute(node, "slot");
+    try {
+      return Integer.parseInt(value);
+    } catch (NumberFormatException e) {
+      throw new IOException("Invalid slot: " + value, e);
+    }
+  }
+
+  /**
+   * Returns the value of an attribute, failing with an {@link IOException} when it is absent.
+   */
+  private static String requiredAttribute(Node node, String attribute) throws IOException {
+    Node attr = node.getAttributes() == null ? null : node.getAttributes().getNamedItem(attribute);
+    if (attr == null) {
+      throw new IOException("Missing attribute " + attribute + " on node " + node.getNodeName());
+    }
+    return attr.getNodeValue();
+  }
+
+  /**
+   * Reads an Irish Sentence Bank XML document from a file.
+   */
+  static IrishSentenceBankDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
new file mode 100644
index 0000000..e7c06d1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Presents the sentences of an {@link IrishSentenceBankDocument} as one
+ * {@link SentenceSample}: the sentence texts are joined, each followed by a
+ * single space, and one span is recorded per sentence.
+ */
+class IrishSentenceBankSentenceStream implements ObjectStream<SentenceSample> {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  IrishSentenceBankSentenceStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  /**
+   * @return a sample covering every sentence not yet consumed, or {@code null}
+   *     once the sentences are used up
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+
+    // end of stream is reached, indicate that with null return value
+    if (!sentenceIt.hasNext()) {
+      return null;
+    }
+
+    StringBuilder text = new StringBuilder();
+    List<Span> spans = new LinkedList<>();
+
+    do {
+      String original = sentenceIt.next().getOriginal();
+      int begin = text.length();
+
+      if (original != null) {
+        text.append(original);
+      }
+
+      spans.add(new Span(begin, text.length()));
+      text.append(' ');
+    } while (sentenceIt.hasNext());
+
+    return new SentenceSample(text.toString(), spans.toArray(new Span[spans.size()]));
+  }
+
+  /** Restarts iteration from the first sentence of the document. */
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
new file mode 100644
index 0000000..e26dc56
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Command line factory that makes the "irishsentencebank" format available
+ * for {@link SentenceSample} streams.
+ */
+public class IrishSentenceBankSentenceStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  protected <P> IrishSentenceBankSentenceStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory under the "irishsentencebank" format name. */
+  public static void registerFactory() {
+    IrishSentenceBankSentenceStreamFactory factory =
+        new IrishSentenceBankSentenceStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(SentenceSample.class, "irishsentencebank", factory);
+  }
+
+  /**
+   * Parses the data parameter into a document and wraps it in a sentence stream.
+   */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+
+    Parameters parsedArgs = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", parsedArgs.getData());
+
+    IrishSentenceBankDocument document = null;
+    try {
+      document = IrishSentenceBankDocument.parse(parsedArgs.getData());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new IrishSentenceBankSentenceStream(document);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
new file mode 100644
index 0000000..8cbfac2
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Presents the sentences of an {@link IrishSentenceBankDocument} as a stream
+ * of {@link TokenSample} objects, one per sentence.
+ */
+class IrishSentenceBankTokenSampleStream implements ObjectStream<TokenSample> {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  IrishSentenceBankTokenSampleStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  /**
+   * @return the token sample of the next sentence, or {@code null} when no
+   *     sentence is left
+   */
+  @Override
+  public TokenSample read() throws IOException {
+    if (!sentenceIt.hasNext()) {
+      return null;
+    }
+    return sentenceIt.next().getTokenSample();
+  }
+
+  /** Restarts iteration from the first sentence of the document. */
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
new file mode 100644
index 0000000..86d1225
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.DetokenizerSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Command line factory that makes the "irishsentencebank" format available
+ * for {@link TokenSample} streams.
+ */
+public class IrishSentenceBankTokenSampleStreamFactory extends DetokenizerSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /** Registers this factory under the "irishsentencebank" format name. */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory(
+        IrishSentenceBankTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> IrishSentenceBankTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the data parameter into a document and wraps it in a token sample stream.
+   */
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+
+    IrishSentenceBankDocument isbDoc = null;
+    try {
+      isbDoc = IrishSentenceBankDocument.parse(params.getData());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new IrishSentenceBankTokenSampleStream(isbDoc);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
new file mode 100644
index 0000000..671fea0
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+public class IrishSentenceBankDocumentTest {
+
+  /**
+   * Parses the two-sentence sample resource and checks text, flex entries,
+   * translation and token spans.
+   */
+  @Test
+  public void testParsingSimpleDoc() throws IOException {
+    try (InputStream sampleXml =
+          IrishSentenceBankDocumentTest.class.getResourceAsStream("irishsentencebank-sample.xml")) {
+
+      IrishSentenceBankDocument doc = IrishSentenceBankDocument.parse(sampleXml);
+
+      List<IrishSentenceBankDocument.IrishSentenceBankSentence> sentences = doc.getSentences();
+      Assert.assertEquals(2, sentences.size());
+
+      IrishSentenceBankDocument.IrishSentenceBankSentence first = sentences.get(0);
+      IrishSentenceBankDocument.IrishSentenceBankSentence second = sentences.get(1);
+
+      // first sentence: text and per-slot flex data
+      Assert.assertEquals("A Dhia, tá mé ag iompar clainne!", first.getOriginal());
+
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] firstFlex = first.getFlex();
+      Assert.assertEquals(7, firstFlex.length);
+      Assert.assertEquals("A", firstFlex[0].getSurface());
+      Assert.assertArrayEquals(new String[]{"a"}, firstFlex[0].getFlex());
+
+      // second sentence: slot 5 carries two lemmas
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] secondFlex = second.getFlex();
+      Assert.assertEquals("ón", secondFlex[4].getSurface());
+      Assert.assertArrayEquals(new String[]{"ó", "an"}, secondFlex[4].getFlex());
+
+      Assert.assertEquals("Excuse me, are you from the stone age?", second.getTranslation());
+
+      // token spans of the first sentence; index 7 is the last word
+      TokenSample sample = first.getTokenSample();
+      Span[] spans = sample.getTokenSpans();
+      Assert.assertEquals(9, spans.length);
+
+      Span clainne = spans[7];
+      Assert.assertEquals(24, clainne.getStart());
+      Assert.assertEquals(31, clainne.getEnd());
+      Assert.assertEquals("clainne", sample.getText().substring(clainne.getStart(), clainne.getEnd()));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
new file mode 100644
index 0000000..91e84c1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
@@ -0,0 +1,25 @@
+<sentences datestamp="2015-03-10">
+<sentence source='potaL'>
+	<original xml:space="preserve"><token slot='1'>A</token> <token slot='2'>Dhia</token>, <token slot='3'>tá</token> <token slot='4'>mé</token> <token slot='5'>ag</token> <token slot='6'>iompar</token> <token slot='7'>clainne</token>!</original>
+	<translation>Oh my God, I&apos;m pregnant!</translation>
+	<flex slot='1' lemma='a'/>
+	<flex slot='2' lemma='dia'/>
+	<flex slot='3' lemma='bí'/>
+	<flex slot='4' lemma='mé'/>
+	<flex slot='5' lemma='ag'/>
+	<flex slot='6' lemma='iompair'/>
+	<flex slot='7' lemma='clann'/>
+</sentence>
+<sentence source='potaL'>
+	<original xml:space="preserve"><token slot='1'>Gabh</token> <token slot='2'>mo</token> <token slot='3'>leithscéal</token>, <token slot='4'>an</token> <token slot='5'>ón</token> <token slot='6'>chlochaois</token> <token slot='7'>thú</token>?</original>
+	<translation>Excuse me, are you from the stone age?</translation>
+	<flex slot='1' lemma='gabh'/>
+	<flex slot='2' lemma='mo'/>
+	<flex slot='3' lemma='leithscéal'/>
+	<flex slot='4' lemma='an'/>
+	<flex slot='5' lemma='ó'/>
+	<flex slot='5' lemma='an'/>
+	<flex slot='6' lemma='clochaois'/>
+	<flex slot='7' lemma='thú'/>
+</sentence>
+</sentences>