You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@opennlp.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2017/12/20 10:02:00 UTC

[jira] [Commented] (OPENNLP-1130) Sentence detector format support for NKJP

    [ https://issues.apache.org/jira/browse/OPENNLP-1130?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16298201#comment-16298201 ] 

ASF GitHub Bot commented on OPENNLP-1130:
-----------------------------------------

kottmann closed pull request #263: OPENNLP-1130 Sentence detector format support for NKJP
URL: https://github.com/apache/opennlp/pull/263
 
 
   

This is a PR merged from a forked repository.
As GitHub does not show the original diff of a pull request from a fork
once it has been merged, the diff is reproduced below for the sake of
provenance:

diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 48b80256f..cd2b4dc69 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -61,6 +61,7 @@
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.formats.nkjp.NKJPSentenceSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesPOSSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStreamFactory;
@@ -128,6 +129,7 @@
     IrishSentenceBankSentenceStreamFactory.registerFactory();
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
     LeipzigLanguageSampleStreamFactory.registerFactory();
+    NKJPSentenceSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
new file mode 100644
index 000000000..b532bd941
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.Span;
+import opennlp.tools.util.XmlUtil;
+
+public class NKJPSegmentationDocument {
+
+  public static class Pointer {
+    String doc;
+    String id;
+    int offset;
+    int length;
+    boolean space_after;
+
+    public Pointer(String doc, String id, int offset, int length, boolean space_after) {
+      this.doc = doc;
+      this.id = id;
+      this.offset = offset;
+      this.length = length;
+      this.space_after = space_after;
+    }
+
+    public Span toSpan() {
+      return new Span(this.offset, this.offset + this.length);
+    }
+
+    @Override
+    public String toString() {
+      return doc + "#string-range(" + id + "," + Integer.toString(offset)
+          + "," + Integer.toString(length) + ")";
+    }
+  }
+
+  public Map<String, Map<String, Pointer>> getSegments() {
+    return segments;
+  }
+
+  Map<String, Map<String, Pointer>> segments;
+
+  NKJPSegmentationDocument() {
+    this.segments = new LinkedHashMap<>();
+  }
+
+  NKJPSegmentationDocument(Map<String, Map<String, Pointer>> segments) {
+    this();
+    this.segments = segments;
+  }
+
+  public static NKJPSegmentationDocument parse(InputStream is) throws IOException {
+
+    Map<String, Map<String, Pointer>> sentences = new LinkedHashMap<>();
+
+    try {
+      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();;
+      Document doc = docBuilder.parse(is);
+
+      XPathFactory xPathfactory = XPathFactory.newInstance();
+      XPath xpath = xPathfactory.newXPath();
+
+      final XPathExpression SENT_NODES = xpath.compile("/teiCorpus/TEI/text/body/p/s");
+      final XPathExpression SEG_NODES = xpath.compile("./seg|./choice");
+      final XPathExpression SEG_NODES_ONLY = xpath.compile("./seg");
+
+      NodeList nl = (NodeList) SENT_NODES.evaluate(doc, XPathConstants.NODESET);
+
+      for (int i = 0; i < nl.getLength(); i++) {
+        Node sentnode = nl.item(i);
+
+        String sentid = null;
+        if (sentnode.getAttributes().getNamedItem("xml:id") != null) {
+          sentid = sentnode.getAttributes().getNamedItem("xml:id").getTextContent();
+        }
+
+        Map<String, Pointer> segments = new LinkedHashMap<>();
+        NodeList segnl = (NodeList) SEG_NODES.evaluate(sentnode, XPathConstants.NODESET);
+
+        for (int j = 0; j < segnl.getLength(); j++) {
+          Node n = segnl.item(j);
+          if (n.getNodeName().equals("seg")) {
+            String segid = xmlID(n);
+            Pointer pointer = fromSeg(n);
+            segments.put(segid, pointer);
+          } else if (n.getNodeName().equals("choice")) {
+            boolean have_seg = false;
+            if (have_seg) {
+              continue;
+            }
+
+            NodeList choices = n.getChildNodes();
+
+            for (int k = 0; k < choices.getLength(); k++) {
+              if (choices.item(k).getNodeName().equals("nkjp:paren")) {
+                if (!checkRejectedParen(choices.item(k))) {
+                  NodeList paren_segs = (NodeList) SEG_NODES_ONLY.evaluate(choices.item(k),
+                      XPathConstants.NODESET);
+                  have_seg = true;
+
+                  for (int l = 0; l < paren_segs.getLength(); l++) {
+                    String segid = xmlID(paren_segs.item(l));
+                    Pointer pointer = fromSeg(paren_segs.item(l));
+                    segments.put(segid, pointer);
+                  }
+                }
+              } else if (choices.item(k).getNodeName().equals("seg")) {
+                if (!checkRejected(choices.item(k))) {
+                  have_seg = true;
+                  String segid = xmlID(choices.item(k));
+                  Pointer pointer = fromSeg(choices.item(k));
+                  segments.put(segid, pointer);
+                }
+              }
+            }
+          }
+        }
+
+        sentences.put(sentid, segments);
+      }
+
+    } catch (SAXException | XPathExpressionException | IOException e) {
+      throw new IOException("Failed to parse NKJP document", e);
+    }
+
+    return new NKJPSegmentationDocument(sentences);
+  }
+
+  static boolean checkRejected(Node n) {
+    if (n.getAttributes() == null) {
+      return false;
+    }
+    if (n.getAttributes().getNamedItem("nkjp:rejected") == null) {
+      return false;
+    }
+    String rejected = n.getAttributes().getNamedItem("nkjp:rejected").getTextContent();
+    return rejected.equals("true");
+  }
+
+  static boolean checkRejectedParen(Node n) {
+    if (n.getChildNodes().getLength() == 0) {
+      return false;
+    } else {
+      for (int i = 0; i < n.getChildNodes().getLength(); i++) {
+        Node m = n.getChildNodes().item(i);
+        if (m.getNodeName().equals("seg")) {
+          if (!checkRejected(m)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
+  }
+
+  static String xmlID(Node n) throws IOException {
+    if (n.getAttributes() == null || n.getAttributes().getLength() < 1) {
+      throw new IOException("Missing required attributes");
+    }
+
+    String id = null;
+    if (n.getAttributes().getNamedItem("xml:id") != null) {
+      id = n.getAttributes().getNamedItem("xml:id").getTextContent();
+    }
+
+    if (id == null) {
+      throw new IOException("Missing xml:id attribute");
+    }
+
+    return id;
+  }
+
+  static Pointer fromSeg(Node n) throws IOException {
+    if (n.getAttributes() == null || n.getAttributes().getLength() < 2) {
+      throw new IOException("Missing required attributes");
+    }
+
+    String ptr = null;
+    if (n.getAttributes().getNamedItem("corresp") != null) {
+      ptr = n.getAttributes().getNamedItem("corresp").getTextContent();
+    }
+    String spacing = "";
+    if (n.getAttributes().getNamedItem("nkjp:nps") != null) {
+      spacing = n.getAttributes().getNamedItem("nkjp:nps").getTextContent();
+    }
+
+    if (ptr == null) {
+      throw new IOException("Missing required attribute");
+    }
+
+    boolean space_after = (ptr.equals("yes"));
+
+    if (!ptr.contains("#") || !ptr.contains("(") || ptr.charAt(ptr.length() - 1) != ')') {
+      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
+    }
+
+    int docend = ptr.indexOf('#');
+    String document = ptr.substring(0, docend);
+
+    int pointer_start = ptr.indexOf('(') + 1;
+    String[] pieces = ptr.substring(pointer_start, ptr.length() - 1).split(",");
+
+    if (pieces.length < 3 || pieces.length > 4) {
+      throw new IOException("String " + ptr + " does not appear to be a valid NKJP corresp attribute");
+    }
+
+    String docid = pieces[0];
+    int offset = 0;
+    int length = 0;
+    if (pieces.length == 3) {
+      offset = Integer.parseInt(pieces[1]);
+      length = Integer.parseInt(pieces[2]);
+    } else {
+      int os1 = Integer.parseInt(pieces[1]);
+      int os2 = Integer.parseInt(pieces[2]);
+      offset = (os1 * 1000) + os2;
+      length = Integer.parseInt(pieces[3]);
+    }
+
+    return new Pointer(document, docid, offset, length, space_after);
+  }
+
+  static NKJPSegmentationDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java
new file mode 100644
index 000000000..5f6c00118
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStream.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class NKJPSentenceSampleStream implements ObjectStream<SentenceSample> {
+  private final NKJPSegmentationDocument segments;
+
+  private final NKJPTextDocument text;
+
+  private Iterator<Map.Entry<String, Map<String, NKJPSegmentationDocument.Pointer>>> segmentIt;
+
+  NKJPSentenceSampleStream(NKJPSegmentationDocument segments, NKJPTextDocument text) {
+    this.segments = segments;
+    this.text = text;
+    reset();
+  }
+
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder sentencesString = new StringBuilder();
+    List<Span> sentenceSpans = new LinkedList<>();
+    Map<String, String> paragraphs = text.getParagraphs();
+
+    while (segmentIt.hasNext()) {
+      Map.Entry<String, Map<String, NKJPSegmentationDocument.Pointer>> segment = segmentIt.next();
+      int start = 0;
+      int end = 0;
+      boolean started = false;
+      String lastParagraphId = "";
+      String currentParagraph = "";
+
+      for (String s : segment.getValue().keySet()) {
+        NKJPSegmentationDocument.Pointer currentPointer = segment.getValue().get(s);
+        Span currentSpan = currentPointer.toSpan();
+
+        if (!started) {
+          start = currentSpan.getStart();
+          started = true;
+          lastParagraphId = currentPointer.id;
+          currentParagraph = paragraphs.get(currentPointer.id);
+        }
+
+        if (!lastParagraphId.equals(currentPointer.id)) {
+          int new_start = sentencesString.length();
+          sentencesString.append(currentParagraph.substring(start, end));
+          int new_end = sentencesString.length();
+          sentenceSpans.add(new Span(new_start, new_end));
+          sentencesString.append(' ');
+
+          start = currentSpan.getStart();
+          end = currentSpan.getEnd();
+          lastParagraphId = currentPointer.id;
+          currentParagraph = paragraphs.get(currentPointer.id);
+        } else {
+          end = currentSpan.getEnd();
+        }
+      }
+
+      int new_start = sentencesString.length();
+      sentencesString.append(currentParagraph.substring(start, end));
+      int new_end = sentencesString.length();
+      sentenceSpans.add(new Span(new_start, new_end));
+      sentencesString.append(' ');
+    }
+
+    // end of stream is reached, indicate that with null return value
+    if (sentenceSpans.size() == 0) {
+      return null;
+    }
+
+    return new SentenceSample(sentencesString.toString(),
+      sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+  }
+
+  @Override
+  public void reset() {
+    segmentIt = segments.getSegments().entrySet().iterator();
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java
new file mode 100644
index 000000000..9a944588d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSentenceSampleStreamFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+public class NKJPSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "text",
+        description = "file containing NKJP text")
+    File getTextFile();
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        "nkjp", new NKJPSentenceSampleStreamFactory(
+        NKJPSentenceSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> NKJPSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+
+    CmdLineUtil.checkInputFile("Text", params.getTextFile());
+
+    NKJPSegmentationDocument segDoc = null;
+    NKJPTextDocument textDoc = null;
+    try {
+      segDoc = NKJPSegmentationDocument.parse(params.getData());
+      textDoc = NKJPTextDocument.parse(params.getTextFile());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new NKJPSentenceSampleStream(segDoc, textDoc);
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
new file mode 100644
index 000000000..53421f4d3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.XmlUtil;
+
+public class NKJPTextDocument {
+
+  Map<String, String> divtypes;
+
+  Map<String, Map<String, Map<String, String>>> texts;
+
+  NKJPTextDocument() {
+    divtypes = new HashMap<>();
+    texts = new HashMap<>();
+  }
+
+  NKJPTextDocument(Map<String, String> divtypes, Map<String, Map<String, Map<String, String>>> texts) {
+    this();
+    this.divtypes = divtypes;
+    this.texts = texts;
+  }
+
+  public static NKJPTextDocument parse(InputStream is) throws IOException {
+    Map<String, String> divtypes = new HashMap<>();
+    Map<String, Map<String, Map<String, String>>> texts = new HashMap<>();
+
+    try {
+      DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();;
+      Document doc = docBuilder.parse(is);
+
+      XPathFactory xPathfactory = XPathFactory.newInstance();
+      XPath xpath = xPathfactory.newXPath();
+
+      final XPathExpression TEXT_NODES_EXAMPLE = xpath.compile("/teiCorpus/TEI/text/group/text");
+      final XPathExpression TEXT_NODES_SAMPLE = xpath.compile("/teiCorpus/TEI/text");
+      final XPathExpression DIV_NODES = xpath.compile("./body/div");
+      final XPathExpression PARA_NODES = xpath.compile("./p|./ab");
+
+      doc.getDocumentElement().normalize();
+      String root = doc.getDocumentElement().getNodeName();
+
+      if (!root.equalsIgnoreCase("teiCorpus")) {
+        throw new IOException("Expected root node " + root);
+      }
+
+      String current_text = "";
+      NodeList textnl = (NodeList) TEXT_NODES_EXAMPLE.evaluate(doc, XPathConstants.NODESET);
+      if (textnl.getLength() == 0) {
+        textnl = (NodeList) TEXT_NODES_SAMPLE.evaluate(doc, XPathConstants.NODESET);
+      }
+
+      for (int i = 0; i < textnl.getLength(); i++) {
+        Node textnode = textnl.item(i);
+        current_text = attrib(textnode, "xml:id", true);
+
+        Map<String, Map<String, String>> current_divs = new HashMap<>();
+        NodeList divnl = (NodeList) DIV_NODES.evaluate(textnode, XPathConstants.NODESET);
+        for (int j = 0; j < divnl.getLength(); j++) {
+          Node divnode = divnl.item(j);
+          String divtype = attrib(divnode, "type", false);
+          String divid = attrib(divnode, "xml:id", true);
+          divtypes.put(divid, divtype);
+
+          Map<String, String> current_paras = new HashMap<>();
+          NodeList paranl = (NodeList) PARA_NODES.evaluate(divnode, XPathConstants.NODESET);
+
+          for (int k = 0; k < paranl.getLength(); k++) {
+            Node pnode = paranl.item(k);
+            String pid = attrib(pnode, "xml:id", true);
+
+            if (pnode.getChildNodes().getLength() != 1
+                && !pnode.getFirstChild().getNodeName().equals("#text")) {
+              throw new IOException("Unexpected content in p element " + pid);
+            }
+
+            String ptext = pnode.getTextContent();
+            current_paras.put(pid, ptext);
+          }
+
+          current_divs.put(divid, current_paras);
+        }
+
+        texts.put(current_text, current_divs);
+      }
+
+    } catch (SAXException | XPathExpressionException | IOException e) {
+      throw new IOException("Failed to parse NKJP document", e);
+    }
+    return new NKJPTextDocument(divtypes, texts);
+  }
+
+  static NKJPTextDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+
+  Map<String, String> getDivtypes() {
+    return Collections.unmodifiableMap(this.divtypes);
+  }
+
+  Map<String, Map<String, Map<String, String>>> getTexts() {
+    return Collections.unmodifiableMap(this.texts);
+  }
+
+  /**
+   * Segmentation etc. is done only in relation to paragraphs,
+   * which are unique within a document. This is to simplify
+   * working with the paragraphs within the document.
+   * @return a map of paragraph IDs and their text
+   */
+  Map<String, String> getParagraphs() {
+    Map<String, String> paragraphs = new HashMap<>();
+    for (String dockey : texts.keySet()) {
+      for (String divkey : texts.get(dockey).keySet()) {
+        for (String pkey : texts.get(dockey).get(divkey).keySet()) {
+          paragraphs.put(pkey, texts.get(dockey).get(divkey).get(pkey));
+        }
+      }
+    }
+    return paragraphs;
+  }
+
+  /**
+   * Helper method to get the value of an attribute
+   * @param n The node being processed
+   * @param attrib The name of the attribute
+   * @param required Whether or not the attribute is required
+   * @return The value of the attribute, or null if not required and not present
+   * @throws IOException if a required attribute is missing
+   */
+  private static String attrib(Node n, String attrib, boolean required) throws IOException {
+    if (required && (n.getAttributes() == null || n.getAttributes().getLength() == 0)) {
+      throw new IOException("Missing required attributes in node " + n.getNodeName());
+    }
+    if (n.getAttributes().getNamedItem(attrib) != null) {
+      return n.getAttributes().getNamedItem(attrib).getTextContent();
+    } else {
+      if (required) {
+        throw new IOException("Required attribute \"" + attrib + "\" missing in node " + n.getNodeName());
+      } else {
+        return null;
+      }
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java
new file mode 100644
index 000000000..226b92ce5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocumentTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class NKJPSegmentationDocumentTest {
+  @Test
+  public void testParsingSimpleDoc() throws IOException {
+    try (InputStream nkjpSegXmlIn =
+           NKJPSegmentationDocumentTest.class.getResourceAsStream("ann_segmentation.xml")) {
+
+      NKJPSegmentationDocument doc = NKJPSegmentationDocument.parse(nkjpSegXmlIn);
+
+      assertEquals(1, doc.getSegments().size());
+
+      assertEquals(7, doc.getSegments().get("segm_1.1-s").size());
+
+      String src = "To krótkie zdanie w drugim akapicie.";
+
+      int offset = doc.getSegments().get("segm_1.1-s").get("segm_1.1-seg").offset;
+      assertEquals(0, offset);
+      int length = doc.getSegments().get("segm_1.1-s").get("segm_1.1-seg").length;
+      assertEquals(2, length);
+      assertEquals("To", src.substring(offset, length));
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java
new file mode 100644
index 000000000..760af8973
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/nkjp/NKJPTextDocumentTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.nkjp;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class NKJPTextDocumentTest {
+  @Test
+  public void testParsingSimpleDoc() throws Exception {
+    try (InputStream nkjpTextXmlIn =
+           NKJPTextDocumentTest.class.getResourceAsStream("text_structure.xml")) {
+
+      NKJPTextDocument doc = NKJPTextDocument.parse(nkjpTextXmlIn);
+
+      assertEquals(1, doc.getDivtypes().size());
+      assertEquals("article", doc.getDivtypes().get("div-1"));
+
+      assertEquals(1, doc.getTexts().size());
+      assertEquals(1, doc.getTexts().get("text-1").size());
+      assertEquals(2, doc.getTexts().get("text-1").get("div-1").size());
+
+      String exp = "To krótki tekst w formacie NKJP. Zawiera dwa zdania.";
+      assertEquals(exp, doc.getTexts().get("text-1").get("div-1").get("p-1"));
+    }
+  }
+
+  @Test
+  public void testGetParagraphs() throws Exception {
+    try (InputStream nkjpTextXmlIn =
+           NKJPTextDocumentTest.class.getResourceAsStream("text_structure.xml")) {
+
+      NKJPTextDocument doc = NKJPTextDocument.parse(nkjpTextXmlIn);
+      Map<String, String> paras = doc.getParagraphs();
+      assertEquals("To krótkie zdanie w drugim akapicie.", paras.get("ab-1"));
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml
new file mode 100644
index 000000000..15cde1c69
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/ann_segmentation.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude" xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
+    <TEI>
+        <text xml:id="segm_text" xml:lang="pl">
+            <body xml:id="segm_body">
+                <p corresp="text.xml#txt_1-div" xml:id="segm_1-p">
+                    <s xml:id="segm_1.1-s">
+                        <seg corresp="text_structure.xml#string-range(ab-1,0,2)" xml:id="segm_1.1-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,3,7)" xml:id="segm_1.2-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,11,6)" xml:id="segm_1.3-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,18,1)" xml:id="segm_1.4-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,20,6)" xml:id="segm_1.5-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,27,8)" xml:id="segm_1.6-seg"/>
+                        <seg corresp="text_structure.xml#string-range(ab-1,35,1)" nkjp:nps="true" xml:id="segm_1.7-seg"/>
+                    </s>
+                </p>
+            </body>
+        </text>
+    </TEI>
+</teiCorpus>
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml
new file mode 100644
index 000000000..61a4ce1fd
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/nkjp/text_structure.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?oxygen RNGSchema="NKJP_structure.rng" type="xml"?>
+
+<teiCorpus xmlns:xi="http://www.w3.org/2001/XInclude" xmlns="http://www.tei-c.org/ns/1.0">
+    <TEI>
+        <text>
+            <group>
+                <text decls="#bibl-1" xml:id="text-1">
+                    <front>
+                        <docTitle>
+                            <titlePart type="main" xml:id="titlePart-1">Krótki tekst</titlePart>
+                        </docTitle>
+                    </front>
+                    <body>
+                        <div type="article" xml:id="div-1">
+                            <p xml:id="p-1">To krótki tekst w formacie NKJP. Zawiera dwa zdania.</p>
+                            <ab xml:id="ab-1">To krótkie zdanie w drugim akapicie.</ab>
+                        </div>
+                    </body>
+                </text>
+            </group>
+        </text>
+    </TEI>
+</teiCorpus>


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Sentence detector format support for NKJP
> -----------------------------------------
>
>                 Key: OPENNLP-1130
>                 URL: https://issues.apache.org/jira/browse/OPENNLP-1130
>             Project: OpenNLP
>          Issue Type: New Feature
>          Components: Formats
>            Reporter: Jim Regan
>            Priority: Minor
>




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)