You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/12/20 10:01:42 UTC

[opennlp] branch master updated: OPENNLP-1130 Sentence detector format support for NKJP

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new cdcb989  OPENNLP-1130 Sentence detector format support for NKJP
cdcb989 is described below

commit cdcb989749cdc56926c1c99c2c75372f105c92bc
Author: Jim O'Regan <ja...@tcd.ie>
AuthorDate: Mon Sep 11 01:56:48 2017 +0100

    OPENNLP-1130 Sentence detector format support for NKJP
---
 .../tools/cmdline/StreamFactoryRegistry.java       |   2 +
 .../formats/nkjp/NKJPSegmentationDocument.java     | 260 +++++++++++++++++++++
 .../formats/nkjp/NKJPSentenceSampleStream.java     | 105 +++++++++
 .../nkjp/NKJPSentenceSampleStreamFactory.java      |  69 ++++++
 .../tools/formats/nkjp/NKJPTextDocument.java       | 182 +++++++++++++++
 .../formats/nkjp/NKJPSegmentationDocumentTest.java |  48 ++++
 .../tools/formats/nkjp/NKJPTextDocumentTest.java   |  57 +++++
 .../tools/formats/nkjp/ann_segmentation.xml        |  20 ++
 .../opennlp/tools/formats/nkjp/text_structure.xml  |  24 ++
 9 files changed, 767 insertions(+)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 58bd87b..db95a4f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -61,6 +61,7 @@ import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.formats.nkjp.NKJPSentenceSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesPOSSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStreamFactory;
@@ -128,6 +129,7 @@ public final class StreamFactoryRegistry {
     IrishSentenceBankSentenceStreamFactory.registerFactory();
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
     LeipzigLanguageSampleStreamFactory.registerFactory();
+    NKJPSentenceSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
new file mode 100644
index 0000000..b532bd9
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.Span;
+import opennlp.tools.util.XmlUtil;
+
+/**
+ * Sentence and segment annotation of an NKJP document. For every sentence
+ * (<code>s</code> element) it keeps the segments of that sentence in the order
+ * they were read, each as a {@link Pointer} parsed from the segment's
+ * <code>corresp</code> attribute.
+ */
+public class NKJPSegmentationDocument {
+
+  /**
+   * A parsed <code>corresp</code> reference of the form
+   * <code>doc#string-range(id,offset,length)</code>.
+   */
+  public static class Pointer {
+    String doc;
+    String id;
+    int offset;
+    int length;
+    boolean space_after;
+
+    public Pointer(String doc, String id, int offset, int length, boolean space_after) {
+      this.doc = doc;
+      this.id = id;
+      this.offset = offset;
+      this.length = length;
+      this.space_after = space_after;
+    }
+
+    /**
+     * @return the span starting at <code>offset</code> and ending at
+     *     <code>offset + length</code>
+     */
+    public Span toSpan() {
+      return new Span(this.offset, this.offset + this.length);
+    }
+
+    @Override
+    public String toString() {
+      return doc + "#string-range(" + id + "," + offset + "," + length + ")";
+    }
+  }
+
+  /**
+   * @return the segments keyed by sentence id, then by segment id
+   */
+  public Map<String, Map<String, Pointer>> getSegments() {
+    return segments;
+  }
+
+  Map<String, Map<String, Pointer>> segments;
+
+  NKJPSegmentationDocument() {
+    this.segments = new LinkedHashMap<>();
+  }
+
+  NKJPSegmentationDocument(Map<String, Map<String, Pointer>> segments) {
+    // No call to this(): the map it allocates would be replaced immediately.
+    this.segments = segments;
+  }
+
+  /**
+   * Reads a segmentation document from XML.
+   * <p>
+   * Sentences are the nodes matched by <code>/teiCorpus/TEI/text/body/p/s</code>.
+   * Inside a sentence, plain <code>seg</code> children are always taken; inside a
+   * <code>choice</code> element every alternative that is not marked as rejected
+   * contributes its segments.
+   *
+   * @param is the stream to read the XML from; it is not closed by this method
+   * @return the parsed document
+   * @throws IOException if the XML cannot be read or parsed, or if a segment
+   *     lacks its id or carries a malformed <code>corresp</code> attribute
+   */
+  public static NKJPSegmentationDocument parse(InputStream is) throws IOException {
+
+    Map<String, Map<String, Pointer>> sentences = new LinkedHashMap<>();
+
+    try {
+      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
+      Document doc = docBuilder.parse(is);
+
+      XPath xpath = XPathFactory.newInstance().newXPath();
+
+      final XPathExpression SENT_NODES = xpath.compile("/teiCorpus/TEI/text/body/p/s");
+      final XPathExpression SEG_NODES = xpath.compile("./seg|./choice");
+      final XPathExpression SEG_NODES_ONLY = xpath.compile("./seg");
+
+      NodeList nl = (NodeList) SENT_NODES.evaluate(doc, XPathConstants.NODESET);
+
+      for (int i = 0; i < nl.getLength(); i++) {
+        Node sentnode = nl.item(i);
+
+        // A sentence without xml:id is stored under the null key.
+        Node sentidAttr = sentnode.getAttributes().getNamedItem("xml:id");
+        String sentid = sentidAttr == null ? null : sentidAttr.getTextContent();
+
+        Map<String, Pointer> segments = new LinkedHashMap<>();
+        NodeList segnl = (NodeList) SEG_NODES.evaluate(sentnode, XPathConstants.NODESET);
+
+        for (int j = 0; j < segnl.getLength(); j++) {
+          Node n = segnl.item(j);
+          if (n.getNodeName().equals("seg")) {
+            addSegment(segments, n);
+          } else if (n.getNodeName().equals("choice")) {
+            // The former "have_seg" guard was tested right after being set to
+            // false and could never fire, so it is dropped without changing
+            // which segments are collected.
+            NodeList choices = n.getChildNodes();
+
+            for (int k = 0; k < choices.getLength(); k++) {
+              Node alternative = choices.item(k);
+              if (alternative.getNodeName().equals("nkjp:paren")) {
+                if (!checkRejectedParen(alternative)) {
+                  NodeList parenSegs = (NodeList) SEG_NODES_ONLY.evaluate(alternative,
+                      XPathConstants.NODESET);
+
+                  for (int l = 0; l < parenSegs.getLength(); l++) {
+                    addSegment(segments, parenSegs.item(l));
+                  }
+                }
+              } else if (alternative.getNodeName().equals("seg")) {
+                if (!checkRejected(alternative)) {
+                  addSegment(segments, alternative);
+                }
+              }
+            }
+          }
+        }
+
+        sentences.put(sentid, segments);
+      }
+
+    } catch (SAXException | XPathExpressionException | IOException e) {
+      throw new IOException("Failed to parse NKJP document", e);
+    }
+
+    return new NKJPSegmentationDocument(sentences);
+  }
+
+  /** Stores the pointer of a <code>seg</code> node under the node's xml:id. */
+  private static void addSegment(Map<String, Pointer> segments, Node seg) throws IOException {
+    String segid = xmlID(seg);
+    segments.put(segid, fromSeg(seg));
+  }
+
+  /**
+   * @param n the node to inspect
+   * @return true only if the node has an <code>nkjp:rejected</code> attribute
+   *     whose value is <code>true</code>
+   */
+  static boolean checkRejected(Node n) {
+    if (n.getAttributes() == null) {
+      return false;
+    }
+    Node rejected = n.getAttributes().getNamedItem("nkjp:rejected");
+    return rejected != null && rejected.getTextContent().equals("true");
+  }
+
+  /**
+   * @param n an <code>nkjp:paren</code> node
+   * @return false if the node has no children or has at least one
+   *     <code>seg</code> child that is not rejected; true otherwise
+   */
+  static boolean checkRejectedParen(Node n) {
+    NodeList children = n.getChildNodes();
+    if (children.getLength() == 0) {
+      return false;
+    }
+    for (int i = 0; i < children.getLength(); i++) {
+      Node m = children.item(i);
+      if (m.getNodeName().equals("seg") && !checkRejected(m)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * @param n the node to read the id from
+   * @return the value of the node's <code>xml:id</code> attribute
+   * @throws IOException if the node has no attributes or no <code>xml:id</code>
+   */
+  static String xmlID(Node n) throws IOException {
+    if (n.getAttributes() == null || n.getAttributes().getLength() < 1) {
+      throw new IOException("Missing required attributes");
+    }
+
+    Node idAttr = n.getAttributes().getNamedItem("xml:id");
+    if (idAttr == null) {
+      throw new IOException("Missing xml:id attribute");
+    }
+
+    return idAttr.getTextContent();
+  }
+
+  /**
+   * Builds a {@link Pointer} from the <code>corresp</code> attribute of a
+   * <code>seg</code> node.
+   * <p>
+   * The text before <code>#</code> becomes the document name, and the
+   * comma-separated values between the parentheses become id, offset and length.
+   * When there are four values, the second and third are combined as
+   * <code>second * 1000 + third</code> to form the offset (presumably an offset
+   * written with a thousands separator; confirm against the corpus).
+   *
+   * @param n the <code>seg</code> node
+   * @return the parsed pointer
+   * @throws IOException if required attributes are missing or the
+   *     <code>corresp</code> value is malformed, including non-numeric offsets
+   */
+  static Pointer fromSeg(Node n) throws IOException {
+    if (n.getAttributes() == null || n.getAttributes().getLength() < 2) {
+      throw new IOException("Missing required attributes");
+    }
+
+    String ptr = null;
+    if (n.getAttributes().getNamedItem("corresp") != null) {
+      ptr = n.getAttributes().getNamedItem("corresp").getTextContent();
+    }
+    String spacing = "";
+    if (n.getAttributes().getNamedItem("nkjp:nps") != null) {
+      spacing = n.getAttributes().getNamedItem("nkjp:nps").getTextContent();
+    }
+
+    if (ptr == null) {
+      throw new IOException("Missing required attribute");
+    }
+
+    // The flag is derived from nkjp:nps. It used to be compared against the
+    // corresp value, which can never equal "yes" once it passes the checks below.
+    // NOTE(review): the bundled test fixture writes nkjp:nps="true"; confirm
+    // which literals (and which polarity) should set this flag.
+    boolean space_after = spacing.equals("yes");
+
+    if (!ptr.contains("#") || !ptr.contains("(") || ptr.charAt(ptr.length() - 1) != ')') {
+      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
+    }
+
+    int docend = ptr.indexOf('#');
+    String document = ptr.substring(0, docend);
+
+    int pointerStart = ptr.indexOf('(') + 1;
+    String[] pieces = ptr.substring(pointerStart, ptr.length() - 1).split(",");
+
+    if (pieces.length < 3 || pieces.length > 4) {
+      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
+    }
+
+    String docid = pieces[0];
+    int offset;
+    int length;
+    try {
+      if (pieces.length == 3) {
+        offset = Integer.parseInt(pieces[1]);
+        length = Integer.parseInt(pieces[2]);
+      } else {
+        int os1 = Integer.parseInt(pieces[1]);
+        int os2 = Integer.parseInt(pieces[2]);
+        offset = (os1 * 1000) + os2;
+        length = Integer.parseInt(pieces[3]);
+      }
+    } catch (NumberFormatException e) {
+      // Report bad numbers like every other malformed corresp value, instead
+      // of letting an unchecked exception escape from parse().
+      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute", e);
+    }
+
+    return new Pointer(document, docid, offset, length, space_after);
+  }
+
+  /**
+   * Reads a segmentation document from a file.
+   *
+   * @param file the XML file to read
+   * @return the parsed document
+   * @throws IOException if the file cannot be opened or parsed
+   */
+  static NKJPSegmentationDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java
new file mode 100644
index 0000000..5f6c001
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Turns an NKJP segmentation document and its text document into a
+ * {@link SentenceSample}. Sentence boundaries are taken from the segmentation,
+ * the sentence text from the paragraphs the segments point into.
+ */
+public class NKJPSentenceSampleStream implements ObjectStream<SentenceSample> {
+  private final NKJPSegmentationDocument segments;
+
+  private final NKJPTextDocument text;
+
+  private Iterator<Map.Entry<String, Map<String, NKJPSegmentationDocument.Pointer>>> segmentIt;
+
+  NKJPSentenceSampleStream(NKJPSegmentationDocument segments, NKJPTextDocument text) {
+    this.segments = segments;
+    this.text = text;
+    reset();
+  }
+
+  /**
+   * Consumes all remaining sentences and returns them as one sample, with the
+   * sentences separated by a single space. A sentence whose segments point into
+   * more than one paragraph yields one span per paragraph.
+   *
+   * @return the sample, or null once no sentences are left
+   * @throws IOException if a segment refers to a paragraph that is not in the
+   *     text document, or to a range outside that paragraph
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder sentencesString = new StringBuilder();
+    List<Span> sentenceSpans = new LinkedList<>();
+    Map<String, String> paragraphs = text.getParagraphs();
+
+    while (segmentIt.hasNext()) {
+      Map<String, NKJPSegmentationDocument.Pointer> sentence = segmentIt.next().getValue();
+      int start = 0;
+      int end = 0;
+      boolean started = false;
+      String lastParagraphId = "";
+      String currentParagraph = "";
+
+      for (NKJPSegmentationDocument.Pointer currentPointer : sentence.values()) {
+        Span currentSpan = currentPointer.toSpan();
+
+        if (!started) {
+          start = currentSpan.getStart();
+          started = true;
+          lastParagraphId = currentPointer.id;
+          currentParagraph = paragraphText(paragraphs, currentPointer.id);
+        }
+
+        if (!lastParagraphId.equals(currentPointer.id)) {
+          // The sentence continues in another paragraph: emit what was
+          // collected so far and start over in the new paragraph.
+          appendSentence(sentencesString, sentenceSpans, lastParagraphId,
+              currentParagraph, start, end);
+
+          start = currentSpan.getStart();
+          end = currentSpan.getEnd();
+          lastParagraphId = currentPointer.id;
+          currentParagraph = paragraphText(paragraphs, currentPointer.id);
+        } else {
+          end = currentSpan.getEnd();
+        }
+      }
+
+      appendSentence(sentencesString, sentenceSpans, lastParagraphId,
+          currentParagraph, start, end);
+    }
+
+    // end of stream is reached, indicate that with null return value
+    if (sentenceSpans.isEmpty()) {
+      return null;
+    }
+
+    return new SentenceSample(sentencesString.toString(),
+      sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+  }
+
+  /** Looks up a paragraph and fails with a checked exception if it is unknown. */
+  private static String paragraphText(Map<String, String> paragraphs, String id)
+      throws IOException {
+    String paragraph = paragraphs.get(id);
+    if (paragraph == null) {
+      throw new IOException("Segmentation refers to unknown paragraph " + id);
+    }
+    return paragraph;
+  }
+
+  /**
+   * Appends <code>paragraph[start, end)</code> followed by a space to the
+   * output and records the span the appended text occupies there.
+   */
+  private static void appendSentence(StringBuilder out, List<Span> spans, String paragraphId,
+      String paragraph, int start, int end) throws IOException {
+    if (start < 0 || start > end || end > paragraph.length()) {
+      throw new IOException("Segment range " + start + "-" + end
+          + " does not fit into paragraph " + paragraphId);
+    }
+    int newStart = out.length();
+    out.append(paragraph.substring(start, end));
+    spans.add(new Span(newStart, out.length()));
+    out.append(' ');
+  }
+
+  /** Restarts iteration at the first sentence of the segmentation document. */
+  @Override
+  public void reset() {
+    segmentIt = segments.getSegments().entrySet().iterator();
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..9a94458
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Command line factory that is registered under the format name
+ * <code>nkjp</code> for {@link SentenceSample} streams.
+ */
+public class NKJPSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  /** Arguments of the format: the inherited data file plus the text file. */
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "text",
+        description = "file containing NKJP text")
+    File getTextFile();
+  }
+
+  /** Adds an instance of this factory to the {@link StreamFactoryRegistry}. */
+  public static void registerFactory() {
+    NKJPSentenceSampleStreamFactory factory =
+        new NKJPSentenceSampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(SentenceSample.class, "nkjp", factory);
+  }
+
+  protected <P> NKJPSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments, checks both input files and reads them: the data
+   * file as segmentation document, the text file as text document.
+   *
+   * @param args the command line arguments of the format
+   * @return a stream over the sentences of the two documents
+   */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    CmdLineUtil.checkInputFile("Text", params.getTextFile());
+
+    NKJPSegmentationDocument segmentation = null;
+    NKJPTextDocument text = null;
+    try {
+      segmentation = NKJPSegmentationDocument.parse(params.getData());
+      text = NKJPTextDocument.parse(params.getTextFile());
+    } catch (IOException ex) {
+      // NOTE(review): presumably this call does not return normally; otherwise
+      // the stream below is created with null documents. Confirm in CmdLineUtil.
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new NKJPSentenceSampleStream(segmentation, text);
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
new file mode 100644
index 0000000..53421f4
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.XmlUtil;
+
+/**
+ * Text of an NKJP document: the paragraphs (<code>p</code> and <code>ab</code>
+ * elements) grouped by the <code>div</code> and <code>text</code> elements that
+ * contain them, plus the <code>type</code> of every <code>div</code>.
+ */
+public class NKJPTextDocument {
+
+  Map<String, String> divtypes;
+
+  Map<String, Map<String, Map<String, String>>> texts;
+
+  NKJPTextDocument() {
+    divtypes = new HashMap<>();
+    texts = new HashMap<>();
+  }
+
+  NKJPTextDocument(Map<String, String> divtypes, Map<String, Map<String, Map<String, String>>> texts) {
+    // No call to this(): the maps it allocates would be replaced immediately.
+    this.divtypes = divtypes;
+    this.texts = texts;
+  }
+
+  /**
+   * Reads a text document from XML.
+   * <p>
+   * Texts are looked up at <code>/teiCorpus/TEI/text/group/text</code> first and,
+   * if none is found there, at <code>/teiCorpus/TEI/text</code>.
+   *
+   * @param is the stream to read the XML from; it is not closed by this method
+   * @return the parsed document
+   * @throws IOException if the XML cannot be read or parsed, the root element is
+   *     not <code>teiCorpus</code>, a required <code>xml:id</code> is missing, or
+   *     a paragraph has unexpected content
+   */
+  public static NKJPTextDocument parse(InputStream is) throws IOException {
+    Map<String, String> divtypes = new HashMap<>();
+    Map<String, Map<String, Map<String, String>>> texts = new HashMap<>();
+
+    try {
+      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
+      Document doc = docBuilder.parse(is);
+
+      XPath xpath = XPathFactory.newInstance().newXPath();
+
+      final XPathExpression TEXT_NODES_EXAMPLE = xpath.compile("/teiCorpus/TEI/text/group/text");
+      final XPathExpression TEXT_NODES_SAMPLE = xpath.compile("/teiCorpus/TEI/text");
+      final XPathExpression DIV_NODES = xpath.compile("./body/div");
+      final XPathExpression PARA_NODES = xpath.compile("./p|./ab");
+
+      doc.getDocumentElement().normalize();
+      String root = doc.getDocumentElement().getNodeName();
+
+      if (!root.equalsIgnoreCase("teiCorpus")) {
+        throw new IOException("Expected root node teiCorpus, but found " + root);
+      }
+
+      NodeList textnl = (NodeList) TEXT_NODES_EXAMPLE.evaluate(doc, XPathConstants.NODESET);
+      if (textnl.getLength() == 0) {
+        textnl = (NodeList) TEXT_NODES_SAMPLE.evaluate(doc, XPathConstants.NODESET);
+      }
+
+      for (int i = 0; i < textnl.getLength(); i++) {
+        Node textnode = textnl.item(i);
+        String textid = attrib(textnode, "xml:id", true);
+
+        Map<String, Map<String, String>> divs = new HashMap<>();
+        NodeList divnl = (NodeList) DIV_NODES.evaluate(textnode, XPathConstants.NODESET);
+        for (int j = 0; j < divnl.getLength(); j++) {
+          Node divnode = divnl.item(j);
+          String divtype = attrib(divnode, "type", false);
+          String divid = attrib(divnode, "xml:id", true);
+          divtypes.put(divid, divtype);
+
+          Map<String, String> paras = new HashMap<>();
+          NodeList paranl = (NodeList) PARA_NODES.evaluate(divnode, XPathConstants.NODESET);
+
+          for (int k = 0; k < paranl.getLength(); k++) {
+            Node pnode = paranl.item(k);
+            String pid = attrib(pnode, "xml:id", true);
+
+            // getFirstChild() is null for an empty element; such a paragraph is
+            // kept with empty text instead of failing with a NullPointerException.
+            Node first = pnode.getFirstChild();
+            if (first != null && pnode.getChildNodes().getLength() != 1
+                && !first.getNodeName().equals("#text")) {
+              throw new IOException("Unexpected content in p element " + pid);
+            }
+
+            paras.put(pid, pnode.getTextContent());
+          }
+
+          divs.put(divid, paras);
+        }
+
+        texts.put(textid, divs);
+      }
+
+    } catch (SAXException | XPathExpressionException | IOException e) {
+      throw new IOException("Failed to parse NKJP document", e);
+    }
+    return new NKJPTextDocument(divtypes, texts);
+  }
+
+  /**
+   * Reads a text document from a file.
+   *
+   * @param file the XML file to read
+   * @return the parsed document
+   * @throws IOException if the file cannot be opened or parsed
+   */
+  static NKJPTextDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+
+  /**
+   * @return an unmodifiable view of the div types, keyed by div id; the type is
+   *     null for a div without a <code>type</code> attribute
+   */
+  Map<String, String> getDivtypes() {
+    return Collections.unmodifiableMap(this.divtypes);
+  }
+
+  /**
+   * @return an unmodifiable view of the paragraph texts, keyed by text id, then
+   *     div id, then paragraph id; only the outermost map is wrapped
+   */
+  Map<String, Map<String, Map<String, String>>> getTexts() {
+    return Collections.unmodifiableMap(this.texts);
+  }
+
+  /**
+   * Segmentation etc. is done only in relation to the paragraphs,
+   * which are unique within a document. This is to simplify
+   * working with the paragraphs within the document.
+   * @return a new map of paragraph IDs and their text
+   */
+  Map<String, String> getParagraphs() {
+    Map<String, String> paragraphs = new HashMap<>();
+    for (Map<String, Map<String, String>> divs : texts.values()) {
+      for (Map<String, String> paras : divs.values()) {
+        paragraphs.putAll(paras);
+      }
+    }
+    return paragraphs;
+  }
+
+  /**
+   * Helper method to get the value of an attribute
+   * @param n The node being processed
+   * @param attrib The name of the attribute
+   * @param required Whether or not the attribute is required
+   * @return The value of the attribute, or null if not required and not present
+   * @throws IOException If the attribute is required but not present
+   */
+  private static String attrib(Node n, String attrib, boolean required) throws IOException {
+    // getAttributes() is null for nodes that are not elements; check it for
+    // optional attributes too, so that the lookup below cannot fail.
+    if (n.getAttributes() == null || n.getAttributes().getLength() == 0) {
+      if (required) {
+        throw new IOException("Missing required attributes in node " + n.getNodeName());
+      }
+      return null;
+    }
+
+    Node value = n.getAttributes().getNamedItem(attrib);
+    if (value != null) {
+      return value.getTextContent();
+    }
+    if (required) {
+      throw new IOException("Required attribute \"" + attrib + "\" missing in node " + n.getNodeName());
+    }
+    return null;
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java
new file mode 100644
index 0000000..226b92c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class NKJPSegmentationDocumentTest {
+  /**
+   * Parses the bundled segmentation fixture and checks the number of sentences
+   * and segments as well as the range of the first segment.
+   */
+  @Test
+  public void testParsingSimpleDoc() throws IOException {
+    try (InputStream nkjpSegXmlIn =
+           NKJPSegmentationDocumentTest.class.getResourceAsStream("ann_segmentation.xml")) {
+
+      NKJPSegmentationDocument doc = NKJPSegmentationDocument.parse(nkjpSegXmlIn);
+
+      assertEquals(1, doc.getSegments().size());
+
+      assertEquals(7, doc.getSegments().get("segm_1.1-s").size());
+
+      String src = "To krótkie zdanie w drugim akapicie.";
+
+      int offset = doc.getSegments().get("segm_1.1-s").get("segm_1.1-seg").offset;
+      assertEquals(0, offset);
+      int length = doc.getSegments().get("segm_1.1-s").get("segm_1.1-seg").length;
+      assertEquals(2, length);
+      // substring takes an end index, not a length
+      assertEquals("To", src.substring(offset, offset + length));
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java
new file mode 100644
index 0000000..760af89
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class NKJPTextDocumentTest {
+
+  /** Parses the bundled <code>text_structure.xml</code> fixture. */
+  private static NKJPTextDocument parseFixture() throws Exception {
+    try (InputStream in =
+           NKJPTextDocumentTest.class.getResourceAsStream("text_structure.xml")) {
+      return NKJPTextDocument.parse(in);
+    }
+  }
+
+  /** Checks div types, nesting sizes and the text of the first paragraph. */
+  @Test
+  public void testParsingSimpleDoc() throws Exception {
+    NKJPTextDocument doc = parseFixture();
+
+    assertEquals(1, doc.getDivtypes().size());
+    assertEquals("article", doc.getDivtypes().get("div-1"));
+
+    assertEquals(1, doc.getTexts().size());
+    Map<String, Map<String, String>> divs = doc.getTexts().get("text-1");
+    assertEquals(1, divs.size());
+    assertEquals(2, divs.get("div-1").size());
+
+    String exp = "To krótki tekst w formacie NKJP. Zawiera dwa zdania.";
+    assertEquals(exp, divs.get("div-1").get("p-1"));
+  }
+
+  /** Checks that the flattened paragraph map contains the <code>ab</code> element. */
+  @Test
+  public void testGetParagraphs() throws Exception {
+    Map<String, String> paras = parseFixture().getParagraphs();
+    assertEquals("To krótkie zdanie w drugim akapicie.", paras.get("ab-1"));
+  }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml
new file mode 100644
index 0000000..15cde1c
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude" xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
+    <TEI>
+        <text xml:id="segm_text" xml:lang="pl">
+            <body xml:id="segm_body">
+                <p corresp="text.xml#txt_1-div" xml:id="segm_1-p">
+                    <s xml:id="segm_1.1-s">
+                        <seg corresp="text_structure.xml#string-range(ab-1,0,2)" xml:id="segm_1.1-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,3,7)" xml:id="segm_1.2-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,11,6)" xml:id="segm_1.3-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,18,1)" xml:id="segm_1.4-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,20,6)" xml:id="segm_1.5-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,27,8)" xml:id="segm_1.6-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,35,1)" nkjp:nps="true" xml:id="segm_1.7-seg"/>
+                    </s>
+                </p>
+            </body>
+        </text>
+    </TEI>
+</teiCorpus>
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml
new file mode 100644
index 0000000..61a4ce1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?oxygen RNGSchema="NKJP_structure.rng" type="xml"?>
+
+<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude" xmlns="http://www.tei-c.org/ns/1.0">
+    <TEI>
+        <text>
+            <group>
+                <text decls="#bibl-1" xml:id="text-1">
+                    <front>
+                        <docTitle>
+                            <titlePart type="main" xml:id="titlePart-1">Krótki tekst</titlePart>
+                        </docTitle>
+                    </front>
+                    <body>
+                        <div type="article" xml:id="div-1">
+                            <p xml:id="p-1">To krótki tekst w formacie NKJP. Zawiera dwa zdania.</p>
+                            <ab xml:id="ab-1">To krótkie zdanie w drugim akapicie.</ab>
+                        </div>
+                    </body>
+                </text>
+            </group>
+        </text>
+    </TEI>
+</teiCorpus>

-- 
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].