You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/03/28 19:45:53 UTC
[opennlp] branch master updated: OPENNLP-565 Support for the MASC format (#364)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 4b69d7c OPENNLP-565 Support for the MASC format (#364)
4b69d7c is described below
commit 4b69d7c82b5d8cf4a67aca4d061745a95b510ada
Author: Jiri Zamecnik <24...@users.noreply.github.com>
AuthorDate: Mon Mar 28 21:45:48 2022 +0200
OPENNLP-565 Support for the MASC format (#364)
* Support for the MASC format
---
.../tools/cmdline/StreamFactoryRegistry.java | 9 +
.../opennlp/tools/formats/masc/MascDocument.java | 444 +++++++++++++++++++++
.../tools/formats/masc/MascDocumentStream.java | 235 +++++++++++
.../tools/formats/masc/MascNamedEntityParser.java | 101 +++++
.../formats/masc/MascNamedEntitySampleStream.java | 101 +++++
.../masc/MascNamedEntitySampleStreamFactory.java | 74 ++++
.../tools/formats/masc/MascPOSSampleStream.java | 92 +++++
.../formats/masc/MascPOSSampleStreamFactory.java | 76 ++++
.../tools/formats/masc/MascPennTagParser.java | 112 ++++++
.../opennlp/tools/formats/masc/MascSentence.java | 341 ++++++++++++++++
.../tools/formats/masc/MascSentenceParser.java | 64 +++
.../formats/masc/MascSentenceSampleStream.java | 94 +++++
.../masc/MascSentenceSampleStreamFactory.java | 81 ++++
.../java/opennlp/tools/formats/masc/MascToken.java | 83 ++++
.../tools/formats/masc/MascTokenSampleStream.java | 112 ++++++
.../formats/masc/MascTokenSampleStreamFactory.java | 82 ++++
.../java/opennlp/tools/formats/masc/MascWord.java | 42 ++
.../opennlp/tools/formats/masc/MascWordParser.java | 63 +++
.../masc/MascNamedEntitySampleStreamTest.java | 167 ++++++++
.../formats/masc/MascPOSSampleStreamTest.java | 155 +++++++
.../formats/masc/MascSentenceSampleStreamTest.java | 165 ++++++++
.../formats/masc/MascTokenSampleStreamTest.java | 175 ++++++++
.../opennlp/tools/formats/masc/fakeMASC-ne.xml | 20 +
.../opennlp/tools/formats/masc/fakeMASC-penn.xml | 145 +++++++
.../opennlp/tools/formats/masc/fakeMASC-s.xml | 29 ++
.../opennlp/tools/formats/masc/fakeMASC-seg.xml | 20 +
.../opennlp/tools/formats/masc/fakeMASC.hdr | 37 ++
.../opennlp/tools/formats/masc/fakeMASC.txt | 1 +
28 files changed, 3120 insertions(+)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index db95a4f..215c80b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -59,6 +59,10 @@ import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFa
import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
+import opennlp.tools.formats.masc.MascNamedEntitySampleStreamFactory;
+import opennlp.tools.formats.masc.MascPOSSampleStreamFactory;
+import opennlp.tools.formats.masc.MascSentenceSampleStreamFactory;
+import opennlp.tools.formats.masc.MascTokenSampleStreamFactory;
import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
import opennlp.tools.formats.nkjp.NKJPSentenceSampleStreamFactory;
@@ -130,6 +134,11 @@ public final class StreamFactoryRegistry {
IrishSentenceBankTokenSampleStreamFactory.registerFactory();
LeipzigLanguageSampleStreamFactory.registerFactory();
NKJPSentenceSampleStreamFactory.registerFactory();
+
+ MascNamedEntitySampleStreamFactory.registerFactory();
+ MascPOSSampleStreamFactory.registerFactory();
+ MascSentenceSampleStreamFactory.registerFactory();
+ MascTokenSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java
new file mode 100644
index 0000000..0e7af1a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java
@@ -0,0 +1,444 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.SAXParser;
+
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.Span;
+import opennlp.tools.util.XmlUtil;
+
+
+public class MascDocument {
+
+ private final List<MascSentence> sentences;
+ private final String pathToFile;
+ private Iterator<MascSentence> sentenceIterator;
+ private boolean hasPennTags = false;
+ private boolean hasNamedEntities = false;
+
+ /**
+  * Creates a document from sentences that have already been assembled.
+  *
+  * @param path The path of the document header; it is only used in diagnostic messages.
+  * @param sentences The sentences of the document. The list is stored as given, not copied.
+  */
+ public MascDocument(String path, List<MascSentence> sentences) {
+   this.sentences = sentences;
+   this.pathToFile = path;
+   // start reading at the first sentence
+   this.sentenceIterator = this.sentences.iterator();
+ }
+
+ /**
+  * Creates a MASC document with all of the stand-off annotations translated into the internal
+  * structure. Every supplied stream is closed before this method returns, whether the parsing
+  * succeeds or fails.
+  *
+  * @param path The path where the document header is.
+  * @param f_primary The file with the raw corpus text. Required.
+  * @param f_seg The file with segmentation into quarks. Required.
+  * @param f_penn The file with tokenization and Penn POS tags produced
+  *               by GATE-5.0 ANNIE application, or null if the document has none.
+  * @param f_s The file with sentence boundaries. Required.
+  * @param f_ne The file with named entities, or null if the document has none.
+  * @return A document containing the text and its annotations. Immutability is not guaranteed yet.
+  * @throws IOException if a required stream is missing, the raw data cannot be read or the
+  *                     alignment of the raw data with annotations fails
+  */
+ public static MascDocument parseDocument(String path, InputStream f_primary, InputStream f_seg,
+                                          InputStream f_penn, InputStream f_s, InputStream f_ne)
+     throws IOException {
+
+   try {
+     // No sentence can be built without these three inputs; report that as a checked
+     // exception instead of failing with a NullPointerException further down.
+     if (f_primary == null || f_seg == null || f_s == null) {
+       throw new IOException("The raw text, the segmentation and the sentence annotation " +
+           "are all required to parse the document: " + path);
+     }
+
+     String text = readText(f_primary);
+     List<MascWord> words = parseWords(f_seg);
+     List<Span> sentenceSpans = parseSentences(f_s);
+
+     List<MascSentence> sentences = combineAnnotations(text, sentenceSpans, words);
+     MascDocument doc = new MascDocument(path, sentences);
+
+     // if the file has Penn POS tags, add them
+     if (f_penn != null) {
+       doc.addPennTags(parsePennTags(f_penn));
+     }
+
+     // if the file has named entity annotations, add them
+     if (f_ne != null) {
+       doc.addNamedEntityTags(parseNamedEntity(f_ne));
+     }
+
+     //todo: make the annotations immutable
+     //todo: should we cleanup the document (e.g. remove sentences without tokens?)
+     return doc;
+   } finally {
+     // Each parsing step closes the stream it consumed, but a failure in an early step would
+     // leave the later streams open. Closing an already closed stream has no effect.
+     for (InputStream stream : new InputStream[] {f_primary, f_seg, f_penn, f_s, f_ne}) {
+       if (stream != null) {
+         try {
+           stream.close();
+         } catch (IOException ignored) {
+           // best-effort cleanup: must not mask the outcome of the parsing itself
+         }
+       }
+     }
+   }
+ }
+
+ /**
+  * Reads the complete corpus file, decoded as UTF-8, into memory.
+  *
+  * @param stream The corpus file; it is closed before this method returns
+  * @return The text of the file
+  * @throws IOException if reading or closing the stream fails
+  */
+ private static String readText(InputStream stream) throws IOException {
+   try {
+     Reader in = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
+     StringBuilder text = new StringBuilder();
+     char[] chunk = new char[8192];
+     // copy chunk by chunk until the reader reports that nothing more was read
+     for (int n = in.read(chunk, 0, chunk.length); n > 0; n = in.read(chunk, 0, chunk.length)) {
+       text.append(chunk, 0, n);
+     }
+     return text.toString();
+   } finally {
+     // this may throw an exception
+     stream.close();
+   }
+ }
+
+
+ /**
+  * Parses the word segmentation stand-off annotation.
+  *
+  * @param f_seg The file with segmentation; it is closed before this method returns
+  * @return An unmodifiable list of individual quarks, expressed as MascWord-s
+  * @throws IOException if the file cannot be read, parsed or closed
+  */
+ private static List<MascWord> parseWords(InputStream f_seg) throws IOException {
+
+   try {
+     SAXParser saxParser = XmlUtil.createSaxParser();
+     MascWordParser handler = new MascWordParser();
+     try {
+       saxParser.parse(f_seg, handler);
+     } catch (SAXException e) {
+       // keep the parser failure as the cause so its diagnostics are not lost
+       throw new IOException("Could not parse the region annotation file", e);
+     }
+
+     return Collections.unmodifiableList(handler.getAnchors());
+
+   } finally {
+     f_seg.close();
+   }
+ }
+
+ /**
+  * Parses the sentence annotation file and drops sentences that overlap their successor.
+  *
+  * @param f_s the sentence annotation file; it is closed before this method returns
+  * @return an unmodifiable list of Spans delimiting each sentence, empty if the file
+  *         annotates no sentence
+  * @throws IOException if the sentence file cannot be parsed or closed
+  */
+ private static List<Span> parseSentences(InputStream f_s) throws IOException {
+
+   try {
+     SAXParser saxParser = XmlUtil.createSaxParser();
+     MascSentenceParser handler = new MascSentenceParser();
+     try {
+       saxParser.parse(f_s, handler);
+     } catch (SAXException e) {
+       // keep the parser failure as the cause so its diagnostics are not lost
+       throw new IOException("Could not parse the sentence annotation file", e);
+     }
+
+     List<Span> anchors = handler.getAnchors();
+
+     // Nothing annotated: there is no last sentence to keep, so do not index into the list.
+     if (anchors.isEmpty()) {
+       return Collections.emptyList();
+     }
+
+     /*Filter out sentence overlaps.
+     Keep only those sentences where sentence.end < nextsentence.beginning
+     avoid deleting in the middle and repeatedly shifting the list by copying into a new list*/
+     //todo: can we know a priori, if we need this filtering?
+     int last = anchors.size() - 1;
+     List<Span> filteredAnchors = new ArrayList<>();
+     for (int i = 0; i < last; i++) {
+       if (anchors.get(i).getEnd() < anchors.get(i + 1).getStart()) {
+         filteredAnchors.add(anchors.get(i));
+       }
+     }
+     // the last sentence has no successor it could overlap with
+     filteredAnchors.add(anchors.get(last));
+
+     return Collections.unmodifiableList(filteredAnchors);
+
+   } finally {
+     f_s.close();
+   }
+
+ }
+
+ /**
+  * Parses the Penn-POS (GATE5-ANNIE) stand-off annotation.
+  *
+  * @param f_penn The file with Penn POS tags; it is closed before this method returns
+  * @return A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
+  *         tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from Penn token ID
+  *         (int) to the quark IDs contained in that token.
+  * @throws IOException if the file cannot be read, parsed or closed
+  */
+ private static Map<String, Map> parsePennTags(InputStream f_penn) throws IOException {
+
+   try {
+     SAXParser saxParser = XmlUtil.createSaxParser();
+     MascPennTagParser handler = new MascPennTagParser();
+     try {
+       saxParser.parse(f_penn, handler);
+     } catch (SAXException e) {
+       // keep the parser failure as the cause so its diagnostics are not lost
+       throw new IOException("Could not parse the Penn tag annotation file", e);
+     }
+
+     Map<String, Map> tagsAndBases = new HashMap<>();
+     tagsAndBases.put("tokenToTag", handler.getTags());
+     tagsAndBases.put("tokenToBase", handler.getBases());
+     tagsAndBases.put("tokenToQuarks", handler.getTokenToQuarks());
+
+     return tagsAndBases;
+
+   } finally {
+     f_penn.close();
+   }
+ }
+
+ /**
+  * Parses the named entity stand-off annotation.
+  *
+  * @param f_ne The file with named entity annotations; it is closed before this method returns
+  * @return A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
+  *         to entity type Strings, and entityIDsToTokens, mapping entity ID integers to Penn
+  *         token ID integers
+  * @throws IOException if the file cannot be read, parsed or closed
+  */
+ private static Map<String, Map> parseNamedEntity(InputStream f_ne) throws IOException {
+
+   try {
+     SAXParser saxParser = XmlUtil.createSaxParser();
+     MascNamedEntityParser handler = new MascNamedEntityParser();
+     try {
+       saxParser.parse(f_ne, handler);
+     } catch (SAXException e) {
+       // The parser failure travels as the cause, like in the other annotation parsers,
+       // instead of being printed to standard output and then dropped.
+       throw new IOException("Could not parse the named entity annotation file", e);
+     }
+
+     Map<String, Map> results = new HashMap<>();
+     results.put("entityIDtoEntityType", handler.getEntityIDtoEntityType());
+     results.put("entityIDsToTokens", handler.getEntityIDsToTokens());
+     return results;
+
+   } finally {
+     f_ne.close();
+   }
+ }
+
+ /**
+  * Combines the raw text with annotations that every file should have.
+  *
+  * @param text The raw text.
+  * @param sentenceSpans The spans defining individual sentences. Overlaps are not permitted.
+  *                      Spans of zero length are skipped.
+  * @param words The quarks of the raw text.
+  * @return An unmodifiable list of sentences, each of which is a list of quarks. Some quarks may
+  *         belong to more than one sentence. Quarks which do not belong to a single sentence are
+  *         silently dropped.
+  * @throws IOException If sentences and quarks cannot be aligned.
+  */
+ private static List<MascSentence> combineAnnotations(String text,
+                                                      List<Span> sentenceSpans,
+                                                      List<MascWord> words) throws IOException {
+
+   int wordIndex = 0;
+   int wordCount = words.size();
+   List<MascSentence> sentences = new ArrayList<>();
+   for (Span s : sentenceSpans) {
+     if (s.getEnd() - s.getStart() > 0) {
+       List<MascWord> quarks = new ArrayList<>();
+       int sentenceStart = s.getStart();
+       int sentenceEnd = s.getEnd();
+
+       //todo: is it okay that quarks can cross sentence boundary? What are the implications?
+       /*
+        Allow quarks to cross sentence boundary.
+        The decisive factor determining if a quark belongs to a sentence is if they overlap.
+        I.e. sent.getEnd() > quark.getStart() && sent.getStart() < quark.getEnd()
+        */
+
+       // The previous sentence may have consumed the last quark, which leaves wordIndex equal
+       // to wordCount. Step back onto the last quark rather than reading past the end of the
+       // list; with no quarks at all the index simply stays at 0 and nothing is read.
+       if (wordIndex >= wordCount) {
+         wordIndex = Math.max(wordCount - 1, 0);
+       }
+
+       //Find sentence beginning, should not be needed unless overlaps occur
+       while (wordIndex > 0 && sentenceStart < words.get(wordIndex).getEnd()) {
+         wordIndex--;
+       }
+
+       //todo: can this be translated into Span's methods .crosses()/.contains()?
+       //find all quarks contained or crossing the span of that sentence
+       boolean sentenceOver = false;
+       while ((!sentenceOver) && wordIndex < wordCount) {
+         MascWord nextWord = words.get(wordIndex);
+         int nextWordStart = nextWord.getStart();
+         int nextWordEnd = nextWord.getEnd();
+         // word either ends or starts or ends & starts in the middle of sentence
+         if (sentenceEnd > nextWordStart && sentenceStart < nextWordEnd) {
+           quarks.add(nextWord);
+           if (sentenceEnd == nextWordEnd) {
+             sentenceOver = true;
+           }
+           wordIndex++;
+         } else if (sentenceEnd <= nextWordStart) {
+           // the quark lies entirely behind the sentence: leave it for the next sentence
+           sentenceOver = true;
+         } else {
+           // the quark lies entirely before the sentence: skip it
+           wordIndex++;
+         }
+       }
+
+       // If we are at the end of words, but not in the last sentence, throw an error
+       if (!sentenceOver && sentences.size() != sentenceSpans.size() - 1) {
+         throw new IOException("Sentence ends and word ends do not match." +
+             "First sentence not completed ends at character: " + sentenceEnd);
+       }
+
+       MascSentence sentence = new MascSentence(sentenceStart, sentenceEnd, text, quarks,
+           words);
+       sentences.add(sentence);
+     }
+   }
+
+   return Collections.unmodifiableList(sentences);
+
+ }
+
+
+ /**
+  * Attaches the named entity labels to the tokens of every sentence. A failure to connect
+  * tokens and entities is reported on standard error and leaves the document flagged as
+  * having no named entities.
+  *
+  * @param namedEntities A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
+  *                      to entity type Strings, and entityIDsToTokens, mapping entity ID integers
+  *                      to Penn token ID integers
+  */
+ private void addNamedEntityTags(Map<String, Map> namedEntities) {
+   // Extract individual mappings
+   Map<Integer, String> typesByEntity = namedEntities.get("entityIDtoEntityType");
+   Map<Integer, List<Integer>> tokensByEntity = namedEntities.get("entityIDsToTokens");
+
+   try {
+     for (MascSentence sentence : sentences) {
+       if (!sentence.addNamedEntities(typesByEntity, tokensByEntity)) {
+         System.out.println("\tIssues occurred in the file: " + pathToFile);
+       }
+     }
+     // only reached when every sentence was processed without an exception
+     hasNamedEntities = true;
+   } catch (IOException e) {
+     System.err.println("[ERROR] Failed connecting tokens and named entities.");
+     System.err.println("\tThe error occurred in the file: " + pathToFile);
+     System.err.println(e.getMessage());
+     System.err.println(Arrays.toString(e.getStackTrace()));
+   }
+ }
+
+
+ /**
+  * Attach tags and bases to MascWords in each of the sentences.
+  *
+  * @param tagMaps A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
+  *                tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from Penn
+  *                token ID (int) to the quark IDs contained in that token.
+  * @throws IOException if the tags cannot be attached; the original failure is the cause
+  */
+ private void addPennTags(Map<String, Map> tagMaps) throws IOException {
+   try {
+     // Extract individual mappings
+     Map<Integer, String> tokenToTag = tagMaps.get("tokenToTag");
+     Map<Integer, String> tokenToBase = tagMaps.get("tokenToBase");
+     Map<Integer, int[]> tokenToQuarks = tagMaps.get("tokenToQuarks");
+
+     //Check that all tokens have at least one quark.
+     for (Map.Entry<Integer, int[]> token : tokenToQuarks.entrySet()) {
+       if (token.getValue().length == 0) {
+         System.err.println("[ERROR] Token without quarks: " + token.getKey());
+       }
+     }
+
+     // Invert the mapping: for every quark, the token(s) it is part of
+     Map<Integer, int[]> quarkToTokens = new HashMap<>();
+     for (Map.Entry<Integer, int[]> tokenAndQuarks : tokenToQuarks.entrySet()) {
+       int token = tokenAndQuarks.getKey();
+       int[] quarks = tokenAndQuarks.getValue();
+       for (int quark : quarks) {
+         //very rarely, one quark may belong to several token
+         //this is probably a mistake in the corpus annotation
+         if (quarkToTokens.containsKey(quark)) {
+           // put the new token in front of the tokens recorded so far
+           int[] tokens = quarkToTokens.get(quark);
+           int[] newTokens = new int[tokens.length + 1];
+           newTokens[0] = token;
+           System.arraycopy(tokens, 0, newTokens, 1, tokens.length);
+           System.out.println("[WARNING] One quark belongs to several tokens. f-seg ID: " +
+               quark);
+           System.out.println("\tThe error occurred in file: " + pathToFile);
+           quarkToTokens.put(quark, newTokens);
+         } else {
+           quarkToTokens.put(quark, new int[] {token});
+         }
+       }
+     }
+
+     for (MascSentence s : sentences) {
+       boolean success = s.tokenizePenn(tokenToQuarks, quarkToTokens, tokenToBase, tokenToTag);
+       if (!success) {
+         System.out.println("\tIssue occurred in file: " + pathToFile);
+       }
+     }
+
+     hasPennTags = true;
+
+   } catch (Exception e) {
+     // Chain the failure as the cause; that carries the stack trace, so it no longer needs
+     // to be flattened into the message.
+     throw new IOException("Could not attach POS tags to words. " + e.getMessage(), e);
+   }
+ }
+
+
+ /**
+ * Check whether there is Penn tagging produced by GATE-5.0 ANNIE.
+ * The flag starts out false and is set once addPennTags has processed every sentence.
+ *
+ * @return true if this file has aligned tags/tokens
+ */
+ public boolean hasPennTags() {
+ return hasPennTags;
+ }
+
+ /**
+ * Check whether named entity annotations were attached to this document.
+ * The flag starts out false and is set once addNamedEntityTags has processed every sentence
+ * without an IOException.
+ *
+ * @return true if the named entity labels were attached to the sentences
+ */
+ public boolean hasNamedEntities() {
+ return hasNamedEntities;
+ }
+
+ /**
+  * Advances the internal sentence iterator by one sentence.
+  *
+  * @return The next sentence, or null once the end of the document has been reached.
+  */
+ public MascSentence read() {
+   return sentenceIterator.hasNext() ? sentenceIterator.next() : null;
+ }
+
+ /**
+ * Return the reading of sentences to the beginning of the document.
+ * A fresh iterator over the sentence list replaces the current one, so the next call to
+ * read() yields the first sentence again.
+ */
+ public void reset() {
+ this.sentenceIterator = this.sentences.iterator();
+ }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java
new file mode 100644
index 0000000..4dffcf4
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import javax.xml.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.XmlUtil;
+
+public class MascDocumentStream implements ObjectStream<MascDocument> {
+
+ /**
+  * A helper class to parse the header (.hdr) files. It collects, for every
+  * {@code annotation} and {@code primaryData} element, the value of its {@code f.id}
+  * attribute mapped to the value of its {@code loc} attribute.
+  * The handler needs nothing from the enclosing stream, hence it is a static nested class.
+  */
+ private static class HeaderHandler extends DefaultHandler {
+   // Created eagerly, so a header without any file reference yields an empty map, not null
+   private final HashMap<String, String> annotationFiles = new HashMap<>();
+   private String file = null;
+   private String fType = null;
+
+   /**
+    * @return the collected mapping from the f.id of a file to its location; never null
+    */
+   protected HashMap<String, String> getPathList() {
+     return annotationFiles;
+   }
+
+   @Override
+   public void startElement(String uri, String localName, String qName, Attributes attributes)
+       throws SAXException {
+
+     // remember the location and the type of the file this element refers to
+     if (qName.equalsIgnoreCase("annotation") ||
+         qName.equalsIgnoreCase("primaryData")) {
+       file = attributes.getValue("loc");
+       fType = attributes.getValue("f.id");
+     }
+
+   }
+
+   @Override
+   public void endElement(String uri, String localName, String qName) throws SAXException {
+
+     // register the file remembered when the element was opened
+     if (qName.equalsIgnoreCase("annotation") ||
+         qName.equalsIgnoreCase("primaryData")) {
+       annotationFiles.put(fType, file);
+     }
+
+   }
+
+ }
+ private List<MascDocument> documents = new LinkedList<>();
+ private Iterator<MascDocument> documentIterator;
+ private SAXParser saxParser;
+
+ /**
+  * Creates a MascDocumentStream to read the documents from a given directory and,
+  * recursively, from its subdirectories. No file is filtered out.
+  *
+  * @param mascCorpusDirectory the directory containing all the MASC files
+  * @throws IOException if any stage of the stream creation fails
+  */
+ public MascDocumentStream(File mascCorpusDirectory) throws IOException {
+   // Delegate with this(...): "new MascDocumentStream(...)" would build and discard a second
+   // stream and leave this one without parser, documents and iterator.
+   this(mascCorpusDirectory, true, pathname -> pathname.getName().contains(""));
+ }
+
+ /**
+ * Creates a MascDocumentStream to read the documents from a given directory.
+ * Works iff all annotation files mentioned in the headers are present.
+ *
+ * @param mascCorpusDirectory the directory containing all the MASC files
+ * @param searchRecursive whether the search should go through subdirectories
+ * @param fileFilter a custom file filter to filter out some files or
+ * null to accept anything
+ * @throws IOException if any stage of the stream creation fails
+ */
+ public MascDocumentStream(File mascCorpusDirectory,
+ boolean searchRecursive, FileFilter fileFilter) throws IOException {
+
+ saxParser = XmlUtil.createSaxParser();
+
+ if (!mascCorpusDirectory.isDirectory()) {
+ throw new IOException("Input corpus directory must be a directory " +
+ "according to File.isDirectory()!");
+ }
+
+ int failedLoads = 0;
+ Stack<File> directoryStack = new Stack<>();
+ directoryStack.add(mascCorpusDirectory);
+
+ while (!directoryStack.isEmpty()) {
+ for (File file : directoryStack.pop().listFiles(fileFilter)) {
+ if (file.isFile()) {
+ String hdrFilePath = file.getAbsolutePath();
+
+ // look for the header files
+ if (hdrFilePath.endsWith(".hdr")) {
+
+ HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
+ InputStream f_primary = new BufferedInputStream(
+ new FileInputStream(fileGroup.get("f.text")));
+ InputStream f_seg = (fileGroup.containsKey("f.seg")) ?
+ new BufferedInputStream(new FileInputStream(fileGroup.get("f.seg"))) : null;
+ InputStream f_penn = (fileGroup.containsKey("f.penn")) ?
+ new BufferedInputStream(new FileInputStream(fileGroup.get("f.penn"))) : null;
+ InputStream f_s = (fileGroup.containsKey("f.s")) ?
+ new BufferedInputStream(new FileInputStream(fileGroup.get("f.s"))) : null;
+ InputStream f_ne = (fileGroup.containsKey("f.ne")) ?
+ new BufferedInputStream(new FileInputStream(fileGroup.get("f.ne"))) : null;
+
+ try {
+ documents.add(MascDocument.parseDocument(hdrFilePath, f_primary, f_seg,
+ f_penn, f_s, f_ne));
+ } catch (IOException e) {
+ System.err.println("Failed to parse the file: " + hdrFilePath);
+ System.err.println('\t' + e.getMessage());
+ failedLoads++;
+ }
+ }
+
+ } else if (searchRecursive && file.isDirectory()) {
+ directoryStack.push(file);
+ }
+ }
+ }
+
+ System.out.println("Documents loaded: " + documents.size());
+ if (failedLoads > 0) {
+ System.err.println("Failed loading " + failedLoads + " documents.");
+ }
+ reset();
+
+ }
+
+ /**
+  * Check that all annotation files mentioned in the header are present
+  *
+  * @param path The path to header
+  * @return A map from the f.id of every file mentioned in the header to that file, resolved
+  *         against the folder of the header
+  * @throws IOException If the header cannot be parsed or corpus integrity is violated
+  */
+ private HashMap<String, File> checkAnnotations(String path) throws IOException {
+   HeaderHandler handler = new HeaderHandler();
+   HashMap<String, File> fileGroup = new HashMap<>();
+   File hdrFile = new File(path);
+   try {
+     saxParser.parse(hdrFile, handler);
+   } catch (SAXException e) {
+     // keep the parser failure as the cause so its diagnostics are not lost
+     throw new IOException("Invalid corpus format. " +
+         "Could not parse the header: " + path, e);
+   }
+
+   HashMap<String, String> annotationFiles = handler.getPathList();
+   if (annotationFiles == null) {
+     // the header mentions no file at all, so there is nothing to check
+     return fileGroup;
+   }
+
+   // resolve against the absolute form, so a relative header path still has a parent
+   String pathToFolder = hdrFile.getAbsoluteFile().getParentFile().getAbsolutePath();
+   for (Map.Entry<String, String> annotation : annotationFiles.entrySet()) {
+     File file = new File(pathToFolder, annotation.getValue());
+     // isFile() is false for a missing file too, no separate exists() check is needed
+     if (!file.isFile()) {
+       throw new IOException("Corpus integrity violated. " +
+           "Annotation file " + file.getAbsolutePath() + " is missing.");
+     }
+
+     fileGroup.put(annotation.getKey(), file);
+
+   }
+
+   return fileGroup;
+
+ }
+
+ /**
+  * Rewinds the whole corpus: every document starts again at its first sentence and
+  * the stream starts again at its first document.
+  */
+ public void reset() {
+   documents.forEach(MascDocument::reset);
+   documentIterator = documents.iterator();
+ }
+
+ /**
+  * Return the next document. Client needs to check if this document has the necessary annotations.
+  *
+  * @return A corpus document with all its annotations, or null when no document is left.
+  * @throws IOException if anything goes wrong.
+  */
+ public MascDocument read() throws IOException {
+   return documentIterator.hasNext() ? documentIterator.next() : null;
+ }
+
+ /**
+ * Remove the corpus from the memory.
+ * Both the document list and its iterator are dropped. NOTE(review): read() and reset()
+ * dereference these fields, so the stream must not be used after it was closed.
+ */
+ public void close() {
+ documents = null;
+ documentIterator = null;
+ }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java
new file mode 100644
index 0000000..c1e22de
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A class to process the MASC Named entity stand-off annotation file.
+ * It collects the label of every {@code a} element and the entity-to-token links of
+ * every {@code edge} element.
+ */
+public class MascNamedEntityParser extends DefaultHandler {
+
+  private final Map<Integer, String> entityIDtoEntityType = new HashMap<>();
+  private final Map<Integer, List<Integer>> entityIDsToTokens = new HashMap<>();
+  // only used to warn about tokens that are linked to entities of different types
+  private final Map<Integer, String> tokenToEntity = new HashMap<>();
+
+  /**
+   * @return the mapping from entity ID to the label of that entity
+   */
+  public Map<Integer, String> getEntityIDtoEntityType() {
+    return entityIDtoEntityType;
+  }
+
+  /**
+   * @return the mapping from entity ID to the Penn token IDs linked to that entity,
+   *         in the order in which the links were read
+   */
+  public Map<Integer, List<Integer>> getEntityIDsToTokens() {
+    return entityIDsToTokens;
+  }
+
+  /**
+   * Records the label of an {@code a} element or the link of an {@code edge} element;
+   * other elements are ignored.
+   *
+   * @throws SAXException if an entity is labelled twice or an ID cannot be read
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      if (qName.equals("a")) {
+        int entityID = Integer.parseInt(
+            attributes.getValue("ref").replaceFirst("ne-n", ""));
+        String label = attributes.getValue("label");
+        if (entityIDtoEntityType.containsKey(entityID)) {
+          throw new SAXException("Multiple labels for one named entity");
+        } else {
+          entityIDtoEntityType.put(entityID, label);
+        }
+      }
+
+      if (qName.equals("edge")) {
+        int entityID = Integer.parseInt(
+            attributes.getValue("from").replaceFirst("ne-n", ""));
+        int tokenID = Integer.parseInt(
+            attributes.getValue("to").replaceFirst("penn-n", ""));
+
+        // append the token to the entity, creating the token list on first use
+        entityIDsToTokens.computeIfAbsent(entityID, id -> new ArrayList<>()).add(tokenID);
+
+        /* Not sure what to do with this. There might be multiple entity links to one token.
+         E.g. Colorado will be one token with the entities "city" and "province".
+         For now, we'll only raise alarm when one TokenID should be assigned
+         to different top-level labels, e.g. person & location (since we are dropping the low-level
+         annotations at the moment). To make this work in OpenNLP (does not allow overlaps), we'll
+         keep only the first named entity type.
+         */
+        //todo: Do we want to give the user control over which types have priority?
+        String type = entityIDtoEntityType.get(entityID);
+        if (tokenToEntity.containsKey(tokenID) && !type.equals(tokenToEntity.get(tokenID))) {
+          // NOTE(review): the message ends with the literal word "type", not with the name
+          // of a type; which of the two types is kept is not decided in this class.
+          System.out.println("[WARNING] One token assigned to different named entity types.\n" +
+              "\tPenn-TokenID: " + tokenID + "\n\tToken types: \"" + type + "\", \"" +
+              tokenToEntity.get(tokenID) + "\"\n\tKeeping only " + "\"type\"");
+        }
+        tokenToEntity.put(tokenID, type);
+      }
+
+    } catch (Exception e) {
+      throw new SAXException("Could not parse the named entity annotation file.\n" +
+          e.getMessage(), e);
+    }
+  }
+
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java
new file mode 100644
index 0000000..dd7c6da
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Turns a stream of {@link MascDocument}s into a stream of {@link NameSample}s, one sample per
+ * sentence. Documents for which {@code hasNamedEntities()} is false are skipped.
+ */
+public class MascNamedEntitySampleStream extends FilterObjectStream<MascDocument, NameSample> {
+
+  // The document whose sentences are currently being served; null once everything was consumed.
+  MascDocument buffer;
+
+  /**
+   * Create a stream of named entity samples from a stream of MascDocuments
+   *
+   * @param samples a MascDocumentStream
+   * @throws IOException if the documents cannot be read, or if none of them has named entity
+   *                     labels
+   */
+  public MascNamedEntitySampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    buffer = nextDocumentWithNamedEntities();
+    if (buffer == null) {
+      throw new IOException("None of the documents has named entity labels");
+    }
+  }
+
+  /**
+   * Advance the underlying stream to the next document that has named entity annotation.
+   *
+   * @return that document, or null if the underlying stream is exhausted
+   * @throws IOException if the underlying stream cannot be read
+   */
+  private MascDocument nextDocumentWithNamedEntities() throws IOException {
+    MascDocument document = samples.read();
+    while (document != null && !document.hasNamedEntities()) {
+      document = samples.read();
+    }
+    return document;
+  }
+
+  /**
+   * Get the next sample of named entities.
+   *
+   * @return One sentence together with its named entity annotation, or null if all documents
+   *         have been consumed (also on every further call after that)
+   * @throws IOException if the sample cannot be extracted
+   */
+  @Override
+  public NameSample read() throws IOException {
+
+    /* Read the documents one sentence at a time
+       If the document is over, move to the next one
+       If both document stream and sentence stream are over, return null
+     */
+    try {
+      MascSentence sentence = null;
+      while (sentence == null) {
+        if (buffer == null) {
+          return null; // nothing left; stays that way until reset()
+        }
+        sentence = buffer.read();
+        if (sentence == null) {
+          buffer = nextDocumentWithNamedEntities();
+        }
+      }
+
+      List<String> tokens = sentence.getTokenStrings();
+      String[] tokensArray = tokens.toArray(new String[0]);
+
+      List<Span> namedEntities = sentence.getNamedEntities();
+      Span[] namedEntitiesArray = namedEntities.toArray(new Span[0]);
+
+      //todo: should the user decide about clearAdaptiveData?
+      return new NameSample(tokensArray, namedEntitiesArray, true);
+
+    } catch (IOException e) {
+      // keep the cause so the failing document/sentence can still be diagnosed
+      throw new IOException("Could not get a sample of named entities from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  /**
+   * Rewind the underlying stream and position this stream on the first document that has
+   * named entity annotation, exactly as the constructor does.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = nextDocumentWithNamedEntities();
+  }
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java
new file mode 100644
index 0000000..49bf94b
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory which registers the "masc" format for {@link NameSample}s and builds a
+ * {@link MascNamedEntitySampleStream} from command line arguments.
+ */
+public class MascNamedEntitySampleStreamFactory extends AbstractSampleStreamFactory<NameSample> {
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascNamedEntitySampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Register an instance of this factory under {@link #MASC_FORMAT} for {@link NameSample}s.
+   */
+  public static void registerFactory() {
+    MascNamedEntitySampleStreamFactory factory =
+        new MascNamedEntitySampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(NameSample.class, MASC_FORMAT, factory);
+  }
+
+  /**
+   * Parse the arguments and open a named entity sample stream over the selected MASC files.
+   *
+   * @param args the command line arguments, interpreted according to {@link Parameters}
+   * @return the sample stream; null only if the error handler returns normally
+   */
+  @Override
+  public ObjectStream<NameSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      // accept only files whose name contains the user supplied string
+      FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+      MascDocumentStream documents =
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter);
+
+      return new MascNamedEntitySampleStream(documents);
+    } catch (IOException e) {
+      // NOTE(review): presumably this call always throws; confirm against CmdLineUtil
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  interface Parameters extends BasicFormatParams {
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java
new file mode 100644
index 0000000..7d7b295
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Turns a stream of {@link MascDocument}s into a stream of {@link POSSample}s, one sample per
+ * sentence. Documents for which {@code hasPennTags()} is false are skipped.
+ */
+public class MascPOSSampleStream extends FilterObjectStream<MascDocument, POSSample> {
+
+  // The document whose sentences are currently being served; null once everything was consumed.
+  MascDocument buffer;
+
+  /**
+   * Create a stream of POS-samples from a stream of MascDocuments.
+   *
+   * @param samples A MascDocumentStream.
+   * @throws IOException if the documents cannot be read, or if none of them has POS tags
+   */
+  public MascPOSSampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    buffer = nextDocumentWithPennTags();
+    if (buffer == null) {
+      throw new IOException("None of the documents has POS tags");
+    }
+  }
+
+  /**
+   * Advance the underlying stream to the next document that has Penn tags.
+   * For now, we'll always use Penn tags.
+   *
+   * @return that document, or null if the underlying stream is exhausted
+   * @throws IOException if the underlying stream cannot be read
+   */
+  private MascDocument nextDocumentWithPennTags() throws IOException {
+    MascDocument document = samples.read();
+    while (document != null && !document.hasPennTags()) {
+      document = samples.read();
+    }
+    return document;
+  }
+
+  /**
+   * Get the next sample
+   *
+   * @return One sentence together with its POS tags, or null if all documents have been
+   *         consumed (also on every further call after that)
+   * @throws IOException if anything goes wrong.
+   */
+  @Override
+  public POSSample read() throws IOException {
+
+    /* Read the documents one sentence at a time
+       If the document is over, move to the next one
+       If both document stream and sentence stream are over, return null
+     */
+    try {
+      MascSentence sentence = null;
+      while (sentence == null) {
+        if (buffer == null) {
+          return null; // nothing left; stays that way until reset()
+        }
+        sentence = buffer.read();
+        if (sentence == null) {
+          buffer = nextDocumentWithPennTags();
+        }
+      }
+
+      List<String> tokens = sentence.getTokenStrings();
+      List<String> posTags = sentence.getTags();
+      return new POSSample(tokens, posTags);
+
+    } catch (IOException e) {
+      // keep the cause so the failing document/sentence can still be diagnosed
+      throw new IOException("Could not get a sample of POS tags from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  /**
+   * Rewind the underlying stream and position this stream on the first document that has
+   * Penn tags, exactly as the constructor does.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = nextDocumentWithPennTags();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java
new file mode 100644
index 0000000..c3aa216
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory which registers the "masc" format for {@link POSSample}s and builds a
+ * {@link MascPOSSampleStream} from command line arguments.
+ */
+public class MascPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascPOSSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Register an instance of this factory under {@link #MASC_FORMAT} for {@link POSSample}s.
+   */
+  public static void registerFactory() {
+    MascPOSSampleStreamFactory factory = new MascPOSSampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(POSSample.class, MASC_FORMAT, factory);
+  }
+
+  /**
+   * Parse the arguments and open a POS sample stream over the selected MASC files.
+   *
+   * @param args the command line arguments, interpreted according to {@link Parameters}
+   * @return the sample stream; null only if the error handler returns normally
+   */
+  @Override
+  public ObjectStream<POSSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      // accept only files whose name contains the user supplied string
+      FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+      MascDocumentStream documents =
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter);
+
+      return new MascPOSSampleStream(documents);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  interface Parameters extends BasicFormatParams {
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
new file mode 100644
index 0000000..9ca44a0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Stack;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A SAX handler for MASC's Penn tagging/tokenization stand-off annotation.
+ * <p>
+ * While parsing it collects, keyed by the numeric part of the Penn token ID, the IDs of the
+ * regions (quarks) a token is linked to, the value of its "msd" feature and the value of its
+ * "base" feature.
+ */
+public class MascPennTagParser extends DefaultHandler {
+
+  private final Map<Integer, int[]> tokenToQuarks = new HashMap<>();
+  private final Map<Integer, String> tokenToTag = new HashMap<>();
+  private final Map<Integer, String> tokenToBase = new HashMap<>();
+  // IDs of "node" elements which are still waiting for their "link" element
+  private final Stack<Integer> tokenStack = new Stack<>();
+  // ID referenced by the "a" element which is currently open
+  private final Stack<Integer> tokenStackTag = new Stack<>();
+
+  /**
+   * @return the "msd" feature values collected so far, by token ID
+   */
+  public Map<Integer, String> getTags() {
+    return tokenToTag;
+  }
+
+  /**
+   * @return the "base" feature values collected so far, by token ID
+   */
+  public Map<Integer, String> getBases() {
+    return tokenToBase;
+  }
+
+  /**
+   * @return the region (quark) IDs of each token collected so far, by token ID
+   */
+  public Map<Integer, int[]> getTokenToQuarks() {
+    return tokenToQuarks;
+  }
+
+  /**
+   * Records the information carried by "node", "link", "a" and "f" elements; all other
+   * elements are ignored.
+   *
+   * @throws SAXException if an attribute is missing or malformed, or if a "link"/"f" element
+   *                      shows up without a preceding "node"/"a" element
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      if (qName.equals("node")) {
+        //get the link between region and Penn tag
+        String tokenId = attributes.getValue("xml:id").replaceFirst("penn-n", "");
+        tokenStack.push(Integer.parseInt(tokenId));
+
+      } else if (qName.equals("link")) {
+        if (tokenStack.isEmpty()) {
+          throw new SAXException("The linking of tokens to quarks is broken.");
+        }
+
+        String[] targets = attributes.getValue("targets")
+            .replaceAll("seg-r", "")
+            .split(" ");
+
+        int[] regions = new int[targets.length];
+        for (int i = 0; i < targets.length; i++) {
+          regions[i] = Integer.parseInt(targets[i]);
+        }
+        // the link belongs to the most recently opened node
+        tokenToQuarks.put(tokenStack.pop(), regions);
+
+      } else if (qName.equals("a")) {
+        String tokenId = attributes.getValue("ref").replaceFirst("penn-n", "");
+        tokenStackTag.push(Integer.parseInt(tokenId));
+
+      } else if (qName.equals("f")) {
+        String type = attributes.getValue("name");
+        if (tokenStackTag.isEmpty()) {
+          throw new SAXException("The linking of tokens to their tags/bases is broken.");
+        }
+
+        // peek, not pop: one "a" element may carry several "f" elements
+        if (type.equals("msd")) {
+          tokenToTag.put(tokenStackTag.peek(), attributes.getValue("value"));
+        } else if (type.equals("base")) {
+          tokenToBase.put(tokenStackTag.peek(), attributes.getValue("value"));
+        }
+      }
+
+    } catch (Exception e) {
+      // also wraps the SAXExceptions thrown above, which then show up as the cause
+      throw new SAXException("Could not parse the Penn-POS annotation file.\n" + e.getMessage(), e);
+    }
+  }
+
+
+  @Override
+  public void endElement(String uri, String localName, String qName) throws SAXException {
+
+    // we can forget the current node
+    if (qName.equals("a")) {
+      tokenStackTag.pop();
+    }
+
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java
new file mode 100644
index 0000000..0ba4092
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.util.Span;
+
+public class MascSentence extends Span {
+
+ private class QuarkExtractor {
+
+ private final Map<Integer, MascWord> wordsById;
+ private final List<MascWord> allDocumentWords;
+
+ /**
+ * A helper class to extract the extract a quark from the corpus file even if it is beyond the
+ * bounds of the sentence
+ *
+ * @param wordsById Quarks of the sentence organized by their id
+ * @param allDocumentWords Quarks of the document organized by their id
+ */
+ protected QuarkExtractor(Map<Integer, MascWord> wordsById, List<MascWord> allDocumentWords) {
+ this.wordsById = wordsById;
+ this.allDocumentWords = allDocumentWords;
+ }
+
+ /**
+ * Extract a quark by its key
+ *
+ * @param key The quark's ID
+ * @return The quark reference
+ * @throws IOException if not found in the document
+ */
+ protected MascWord get(int key) throws IOException {
+ //We first check if this word is in the sentence
+ //todo: evaluate the necessity: HashMaps are O(1), right?
+ if (wordsById.containsKey(key)) {
+ return wordsById.get(key);
+ } else {
+ for (MascWord wordFromWholeDocument : allDocumentWords) {
+ if (wordFromWholeDocument.getId() == key) {
+ return wordFromWholeDocument;
+ }
+ }
+ }
+ throw new IOException("Word" + key + " not found in the document.");
+ }
+
+ }
+
+ private final List<MascWord> allDocumentWords;
+ private final String text;
+ private final List<MascWord> words;
+ private final Map<Integer, MascWord> wordsById;
+ private List<MascToken> sentenceTokens = null;
+ private Map<Integer, Integer> tokensById = new HashMap<>();
+ private List<Span> namedEntities = new ArrayList<>();
+
+  /**
+   * Create a MascSentence, containing its associated text and quarks
+   *
+   * @param s              Start of the sentence within the corpus file
+   * @param e              End of the sentence within the corpus file
+   * @param text           The reference to text of the corpus file
+   * @param sentenceQuarks The quarks found in that sentence
+   * @param allQuarks      The reference to a list of all quarks in the file
+   */
+  public MascSentence(int s, int e, String text, List<MascWord> sentenceQuarks,
+                      List<MascWord> allQuarks) {
+    super(s, e);
+    this.text = text;
+    this.words = sentenceQuarks;
+    this.allDocumentWords = allQuarks;
+
+    // Index the sentence quarks by their ID, this speeds up the tokenization
+    Map<Integer, MascWord> quarksById = new HashMap<>();
+    sentenceQuarks.forEach(quark -> quarksById.put(quark.getId(), quark));
+    this.wordsById = quarksById;
+  }
+
+  /**
+   * Add the Penn tokenization and POS tagging to the sentence
+   *
+   * @param tokenToQuarks A map from token ID to quarks in that token
+   * @param quarkToTokens A map of quark IDs and the token IDs containing that quark
+   * @param tokenToBase   Token ID to the token base
+   * @param tokenToTag    Token ID to the POS tag
+   * @return true if no issue encountered, false if tokens cross sentence boundaries or a token
+   *         has no quarks assigned (such a token is skipped)
+   * @throws IOException If a quark of a token cannot be found anywhere in the document
+   */
+  boolean tokenizePenn(Map<Integer, int[]> tokenToQuarks,
+                       Map<Integer, int[]> quarkToTokens,
+                       Map<Integer, String> tokenToBase,
+                       Map<Integer, String> tokenToTag) throws IOException {
+
+    boolean fileWithoutIssues = true;
+    QuarkExtractor extractor = new QuarkExtractor(wordsById, allDocumentWords);
+    List<MascToken> tokens = new ArrayList<>();
+
+    // Every token is looked at once, no matter how many of its quarks are in the sentence
+    Set<Integer> tokensProcessed = new HashSet<>();
+    for (MascWord w : words) {
+      //extract the nodes to which this word belongs
+      int[] tokenIds = quarkToTokens.get(w.getId());
+
+      //Only continue, if the word belongs to at least one node
+      if (tokenIds == null) {
+        continue;
+      }
+
+      for (int token : tokenIds) {
+        //check if we already have the token
+        if (!tokensProcessed.add(token)) {
+          continue;
+        }
+
+        int[] quarksOfToken = tokenToQuarks.get(token); // Get the quark IDs contained in the token
+        if (quarksOfToken == null || quarksOfToken.length == 0) {
+          // Without quarks the token has no position in the text, so it cannot be used
+          System.err.println("Token without quarks found: " + token);
+          fileWithoutIssues = false;
+          continue;
+        }
+
+        /*Because there are some quarks which are parts of tokens outside of a sentence
+        We need to check every time if that quark was actually assigned to the sentence
+        If not, we need to extract it manually from the whole document*/
+        MascWord[] quarks = new MascWord[quarksOfToken.length]; //Get the actual quark references
+        for (int currentQuark = 0; currentQuark < quarks.length; currentQuark++) {
+          int quark = quarksOfToken[currentQuark];
+          if (!wordsById.containsKey(quark)) {
+            fileWithoutIssues = false;
+            System.out.println("[WARNING] Some tokens cross sentence boundaries." +
+                "\n\tQuark ID: " + quark +
+                "\n\tPenn token ID: " + token);
+          }
+          quarks[currentQuark] = extractor.get(quark);
+        }
+
+        int start = quarks[0].getStart();
+        int end = quarks[quarks.length - 1].getEnd();
+
+        //only insert tokens with non-zero length, apparently some of them exist in the corpus
+        if (end - start > 0) {
+          tokens.add(new MascToken(start, end, token, tokenToTag.get(token),
+              tokenToBase.get(token), quarks));
+        }
+      }
+    }
+
+    // Rebuild the token ID -> position index from scratch, dropping entries of earlier runs
+    tokensById.clear();
+    for (int i = 0; i < tokens.size(); i++) {
+      tokensById.put(tokens.get(i).getTokenId(), i);
+    }
+
+    sentenceTokens = Collections.unmodifiableList(tokens);
+    return fileWithoutIssues;
+  }
+
+  /**
+   * Add the named entity annotation to the tokenized sentence
+   * <p>
+   * Each entity becomes a span over token positions, from its first token in this sentence
+   * (inclusive) to its last token in this sentence (exclusive end). Entities without any token
+   * in this sentence are ignored. Of two overlapping entities only the longer one is kept; if
+   * both are equally long, the one which starts first (or was added first) wins.
+   *
+   * @param entityIDtoEntityType Maps the named entity ID to its type
+   * @param entityIDsToTokens    A list of tokens covered by each named entity
+   * @return true if all went well, false if named entities overlap
+   * @throws IOException if the sentence has not been tokenized yet
+   */
+  boolean addNamedEntities(Map<Integer, String> entityIDtoEntityType,
+                           Map<Integer, List<Integer>> entityIDsToTokens) throws IOException {
+    if (sentenceTokens == null) {
+      throw new IOException("Named entity labels provided for un untokenized sentence.");
+    }
+
+    // Work on a copy: the list of an earlier call may be handed out to callers already
+    List<Span> candidates = new ArrayList<>(namedEntities);
+
+    //for each named entity identify its span
+    for (Map.Entry<Integer, List<Integer>> namedEntity : entityIDsToTokens.entrySet()) {
+      String type = entityIDtoEntityType.get(namedEntity.getKey());
+
+      int start = Integer.MAX_VALUE;
+      int end = 0; // exclusive
+      boolean entityInThisSentence = false;
+      for (int tokenID : namedEntity.getValue()) {
+        Integer tokenIndex = tokensById.get(tokenID);
+        if (tokenIndex == null) {
+          continue; // that token belongs to a different sentence
+        }
+        entityInThisSentence = true;
+        start = Math.min(start, tokenIndex);
+        end = Math.max(end, tokenIndex + 1);
+      }
+
+      if (entityInThisSentence) {
+        candidates.add(new Span(start, end, type));
+      }
+    }
+
+    candidates.sort(Comparator.comparingInt(Span::getStart));
+
+    /* The candidates are sorted by their start, so a candidate can only overlap with the last
+       span which was kept. Everything kept before that one ends even earlier. */
+    boolean fileWithoutIssues = true;
+    List<Span> resolved = new ArrayList<>();
+    for (Span candidate : candidates) {
+      if (resolved.isEmpty()) {
+        resolved.add(candidate);
+        continue;
+      }
+
+      int lastIndex = resolved.size() - 1;
+      Span previous = resolved.get(lastIndex);
+      boolean overlapping = previous.contains(candidate) || candidate.contains(previous)
+          || previous.crosses(candidate);
+      if (!overlapping) {
+        resolved.add(candidate);
+        continue;
+      }
+
+      System.out.println("[WARNING] Named entities overlap. This is forbidden in the OpenNLP." +
+          "\n\tKeeping the longer of them.");
+      fileWithoutIssues = false;
+      if (candidate.length() > previous.length()) {
+        resolved.set(lastIndex, candidate);
+      }
+    }
+
+    namedEntities = resolved;
+    return fileWithoutIssues;
+  }
+
+ /**
+ * Get the named entities
+ * <p>
+ * The list held by this sentence is returned as is, not a copy. It is filled by
+ * addNamedEntities and empty before that.
+ *
+ * @return List of named entities defined as token span, e.g. Span(1,3, "org") for tokens [1,3)
+ */
+ public List<Span> getNamedEntities() {
+ return namedEntities;
+ }
+
+ /**
+ * Get the sentence text
+ *
+ * @return Text of the sentence as defined by the sentence segmentation annotation.
+ */
+ public String getSentDetectText() {
+ return text.substring(getStart(), getEnd());
+ }
+
+ /**
+ * Get the text of the sentence tokens
+ *
+ * @return Text of the sentence as defined by the tokens in it.
+ */
+ public String getTokenText() {
+ if (sentenceTokens.isEmpty()) {
+ return "";
+ }
+ return text.substring(sentenceTokens.get(0).getStart(),
+ sentenceTokens.get(sentenceTokens.size() - 1).getEnd());
+ }
+
+ /**
+ * Get the text of the sentence tokens
+ *
+ * @return The texts of the individual tokens in the sentence
+ */
+ public List<String> getTokenStrings() {
+ List<String> tokenArray = new ArrayList<>();
+ for (MascToken t : sentenceTokens) {
+ tokenArray.add(text.substring(t.getStart(), t.getEnd()));
+ }
+
+ return Collections.unmodifiableList(tokenArray);
+
+ }
+
+  /**
+   * Get the boundaries of individual tokens
+   *
+   * @return An unmodifiable list of spans representing the tokens of the sentence (according
+   *         to Penn tokenization), shifted so that the first token starts at 0
+   */
+  public List<Span> getTokensSpans() {
+    // All spans are relative to the start of the first token
+    int offset = 0;
+    if (!sentenceTokens.isEmpty()) {
+      offset = sentenceTokens.get(0).getStart();
+    }
+
+    List<Span> spans = new ArrayList<>();
+    for (MascToken token : sentenceTokens) {
+      int from = token.getStart() - offset;
+      int to = token.getEnd() - offset;
+      spans.add(new Span(from, to));
+    }
+
+    return Collections.unmodifiableList(spans);
+  }
+
+  /**
+   * Get the tags of tokens in the sentence
+   *
+   * @return A list of individual tags, one per token, in token order
+   * @throws IOException if used on an untokenized sentence
+   */
+  public List<String> getTags() throws IOException {
+    // Fail the documented way instead of running into a NullPointerException below
+    if (sentenceTokens == null) {
+      throw new IOException("POS tags requested for an untokenized sentence.");
+    }
+
+    List<String> tags = new ArrayList<>();
+    for (MascToken t : sentenceTokens) {
+      tags.add(t.getPos());
+    }
+    return tags;
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java
new file mode 100644
index 0000000..7a679a0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A SAX handler to parse the sentence segmentation stand-off annotation. Every "region"
+ * element contributes one span, built from the first two numbers of its "anchors" attribute.
+ */
+class MascSentenceParser extends DefaultHandler {
+
+  // Created lazily, stays null as long as no region was seen
+  private List<Span> sentenceAnchors = null;
+
+  /**
+   * @return the sentence spans in the order of their appearance, or null if no region element
+   *         has been parsed
+   */
+  public List<Span> getAnchors() {
+    return sentenceAnchors;
+  }
+
+  /**
+   * Turns a "region" element into a sentence span; other elements are ignored.
+   *
+   * @throws SAXException if the anchors of a region are missing or malformed
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      // create a sentence and put it into the list of sentences
+      if (qName.equalsIgnoreCase("region")) {
+        String[] anchors = attributes.getValue("anchors").split(" ");
+
+        int left = Integer.parseInt(anchors[0]);
+        int right = Integer.parseInt(anchors[1]);
+
+        // initialize list
+        if (sentenceAnchors == null) {
+          sentenceAnchors = new ArrayList<>();
+        }
+
+        sentenceAnchors.add(new Span(left, right));
+      }
+
+    } catch (Exception e) {
+      // keep the cause, otherwise there is no way to tell which region was broken
+      throw new SAXException("Could not parse the sentence annotation file.", e);
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java
new file mode 100644
index 0000000..7e8a5db
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Turns a stream of {@link MascDocument}s into a stream of {@link SentenceSample}s, each made
+ * of up to a fixed number of consecutive sentences.
+ */
+public class MascSentenceSampleStream extends FilterObjectStream<MascDocument, SentenceSample> {
+
+  private final int sentencesPerSample;
+  // The document currently being read; null once the document stream is exhausted
+  private MascDocument buffer;
+
+  /**
+   * Create a stream of sentence samples from a stream of MascDocuments.
+   *
+   * @param samples            a stream of MascDocuments
+   * @param sentencesPerSample how many sentences to put into one sample
+   * @throws IOException if the first document cannot be read
+   */
+  public MascSentenceSampleStream(ObjectStream<MascDocument> samples, int sentencesPerSample)
+      throws IOException {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+    buffer = samples.read();
+  }
+
+  /**
+   * Reads a new sample of sentences
+   * <p>
+   * Within a sample the sentences are separated by a space. Moving on to the next document
+   * inserts a line break.
+   *
+   * @return The specified number of sentences. If fewer left, then return whatever is left.
+   *         Null if there are no sentences left at all (also on every further call).
+   * @throws IOException if the documents cannot be read
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+
+    try {
+      StringBuilder documentText = new StringBuilder();
+      List<Span> sentenceSpans = new ArrayList<>();
+
+      while (buffer != null && sentenceSpans.size() < sentencesPerSample) {
+        MascSentence sentence = buffer.read();
+        if (sentence != null) {
+          // Current document still has sentences
+          int startIndex = documentText.length();
+          documentText.append(sentence.getSentDetectText()).append(' ');
+          sentenceSpans.add(new Span(startIndex, documentText.length() - 1));
+        } else {
+          // Current document exhausted, try to move on to the next one
+          buffer = samples.read();
+          if (buffer != null) {
+            documentText.append('\n');
+          }
+        }
+      }
+
+      if (sentenceSpans.isEmpty()) {
+        // We exhausted all sentences in all documents
+        return null;
+      }
+
+      // Drop the separator which follows the last sentence
+      documentText.setLength(documentText.length() - 1);
+      return new SentenceSample(documentText,
+          sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+
+    } catch (IOException e) {
+      throw new IOException("You are reading an empty document stream. " +
+          "Did you close it?", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = samples.read();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..a445167
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory that builds a {@link SentenceSample} stream from MASC data, configured
+ * through the command-line arguments described by {@link Parameters}.
+ */
+public class MascSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Registers this factory for {@link SentenceSample}s under the {@link #MASC_FORMAT} name.
+   */
+  public static void registerFactory() {
+    MascSentenceSampleStreamFactory factory =
+        new MascSentenceSampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(SentenceSample.class, MASC_FORMAT, factory);
+  }
+
+  /**
+   * Parses the arguments and opens the sentence sample stream.
+   *
+   * @param args command-line arguments matching {@link Parameters}
+   * @return the stream of sentence samples
+   */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      // Keep only the files whose name contains the requested string
+      FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+
+      MascDocumentStream documents =
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter);
+      int sentencesPerSample = Integer.parseInt(params.getSentencesPerSample());
+
+      return new MascSentenceSampleStream(documents, sentencesPerSample);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  /**
+   * Command-line parameters understood by this factory.
+   */
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java
new file mode 100644
index 0000000..5d17a40
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A token of a MASC document: a {@link Span} that additionally carries its POS tag,
+ * base form, ID and the quarks it is made of.
+ */
+public class MascToken extends Span {
+
+  private final int tokenId;
+  private final String base;
+  private final String pos;
+  private final MascWord[] quarks;
+
+  /**
+   * Builds a token, which may be composed of several quarks.
+   *
+   * @param s      Start offset of the token in the corpus file
+   * @param e      End offset of the token in the corpus file
+   * @param pennId ID given to the token by the Penn stand-off annotation
+   * @param pos    POS tag of the token
+   * @param base   Base form of the token
+   * @param quarks Quarks the token consists of
+   */
+  public MascToken(int s, int e, int pennId, String pos, String base, MascWord[] quarks) {
+    super(s, e);
+    this.tokenId = pennId;
+    this.base = base;
+    this.pos = pos;
+    this.quarks = quarks;
+  }
+
+  /**
+   * Returns the ID that was passed as {@code pennId} at construction.
+   *
+   * @return the token ID
+   */
+  public int getTokenId() {
+    return this.tokenId;
+  }
+
+  /**
+   * Returns the base form of the token.
+   *
+   * @return the base form
+   */
+  public String getBase() {
+    return this.base;
+  }
+
+  /**
+   * Returns the POS tag of the token.
+   *
+   * @return the POS tag
+   */
+  public String getPos() {
+    return this.pos;
+  }
+
+  /**
+   * Returns the quarks of the token. The array handed to the constructor is returned
+   * as is, not a copy.
+   *
+   * @return Array of quark references
+   */
+  public MascWord[] getQuarks() {
+    return this.quarks;
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java
new file mode 100644
index 0000000..93fd21d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Stream of {@link TokenSample}s, one per sentence, read from MASC documents.
+ * Only documents for which {@code hasPennTags()} is true are used.
+ */
+public class MascTokenSampleStream extends FilterObjectStream<MascDocument, TokenSample> {
+
+  // Document currently being read; null once the document stream is exhausted
+  MascDocument buffer;
+
+  /**
+   * Creates the stream and positions it on the first document with Penn tokenization.
+   *
+   * @param samples the documents to read
+   * @throws IOException if reading a document fails, or if none of the documents
+   *                     has Penn tokenization
+   */
+  public MascTokenSampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    buffer = nextPennDocument(); // For now, we only use Penn tokenization
+    if (buffer == null) {
+      throw new IOException("None of the documents has Penn tokenization");
+    }
+  }
+
+  /**
+   * Reads the documents one sentence at a time. If the document is over, moves to the
+   * next one. Sentences without any text, or containing a zero-length token, are
+   * reported on standard error and skipped.
+   *
+   * @return the next sample, or {@code null} if both the document stream and the
+   *         sentence stream are over; further calls keep returning {@code null}
+   * @throws IOException if the data cannot be read; the original exception is
+   *                     attached as the cause
+   */
+  @Override
+  public TokenSample read() throws IOException {
+    try {
+      MascSentence sentence;
+      while ((sentence = nextSentence()) != null) {
+        String sentenceString = sentence.getTokenText();
+        List<Span> tokensSpans = sentence.getTokensSpans();
+
+        // Each sentence is judged on its own, so one malformed sentence does not
+        // cause the sentences after it to be discarded.
+        if (isWellFormed(sentenceString, tokensSpans)) {
+          Span[] tokensSpansArray = new Span[tokensSpans.size()];
+          tokensSpans.toArray(tokensSpansArray);
+          return new TokenSample(sentenceString, tokensSpansArray);
+        }
+      }
+      return null;
+    } catch (IOException e) {
+      throw new IOException("Could not get a sample of tokens from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  /**
+   * Resets the document stream and positions this stream on the first document with
+   * Penn tokenization, as the constructor does.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = nextPennDocument();
+  }
+
+  /**
+   * Returns the next document with Penn tokenization, or null if there is none left.
+   */
+  private MascDocument nextPennDocument() throws IOException {
+    MascDocument document;
+    do {
+      document = samples.read();
+    } while (document != null && !document.hasPennTags());
+    return document;
+  }
+
+  /**
+   * Returns the next sentence, moving on to following documents as needed, or null
+   * once all documents are exhausted.
+   */
+  private MascSentence nextSentence() throws IOException {
+    while (buffer != null) {
+      MascSentence sentence = buffer.read();
+      if (sentence != null) {
+        return sentence;
+      }
+      buffer = nextPennDocument();
+    }
+    return null;
+  }
+
+  /**
+   * Checks that the sentence has text and no zero-length token, warning on standard
+   * error about every problem found.
+   */
+  private static boolean isWellFormed(String sentenceString, List<Span> tokensSpans) {
+    boolean wellFormed = true;
+
+    if (sentenceString.length() == 0) {
+      System.err.println("[WARNING] Zero sentence found: " +
+          "there is a sentence without any tokens.");
+      System.err.println(sentenceString);
+      System.err.println(tokensSpans.toString());
+      wellFormed = false;
+    }
+
+    for (Span t : tokensSpans) {
+      if (t.getEnd() - t.getStart() == 0) {
+        System.err.println("[WARNING] Zero token found: " +
+            "there is a token without any quarks.");
+        System.err.println(sentenceString);
+        System.err.println(tokensSpans.toString());
+        wellFormed = false;
+      }
+    }
+
+    return wellFormed;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java
new file mode 100644
index 0000000..655be15
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory that builds a {@link TokenSample} stream from MASC data, configured
+ * through the command-line arguments described by {@link Parameters}.
+ */
+public class MascTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> {
+
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Registers this factory for {@link TokenSample}s under the {@link #MASC_FORMAT} name.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        MASC_FORMAT,
+        new MascTokenSampleStreamFactory(Parameters.class));
+  }
+
+  /**
+   * Parses the arguments and opens the token sample stream.
+   *
+   * @param args command-line arguments matching {@link Parameters}
+   * @return the stream of token samples
+   */
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    try {
+      // Keep only the files whose name contains the requested string
+      FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+
+      return new MascTokenSampleStream(
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  /**
+   * Command-line parameters understood by this factory.
+   */
+  interface Parameters extends BasicFormatParams {
+    // create() never reads this value: the token stream always yields one sentence per
+    // sample. It is kept, as an optional argument, so that existing command lines which
+    // pass it are still accepted while new ones no longer have to supply it.
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    @ArgumentParser.OptionalParameter
+    String getSentencesPerSample();
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java
new file mode 100644
index 0000000..a75dce7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import opennlp.tools.util.Span;
+
+/**
+ * One of MASC's quarks - a basic-level unit (may be sub-word) - stored as a
+ * {@link Span} together with its ID.
+ */
+public class MascWord extends Span {
+
+  private final int id;
+
+  /**
+   * Creates a quark.
+   *
+   * @param s  Start offset of the word in the corpus file
+   * @param e  End offset of the word in the corpus file
+   * @param id ID given to the word by the stand-off annotation
+   */
+  public MascWord(int s, int e, int id) {
+    super(s, e);
+    this.id = id;
+  }
+
+  /**
+   * Returns the ID given at construction.
+   *
+   * @return the ID
+   */
+  public int getId() {
+    return this.id;
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java
new file mode 100644
index 0000000..db57f82
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class to parse the word ("quark") segmentation stand-off annotation
+ */
+class MascWordParser extends DefaultHandler {
+
+  // Created lazily on the first region element, so it stays null if none was seen
+  private List<MascWord> wordAnchors = null;
+
+  /**
+   * Returns the words collected so far.
+   *
+   * @return the words in the order their {@code region} elements were encountered,
+   *         or {@code null} if no such element has been parsed
+   */
+  public List<MascWord> getAnchors() {
+    return wordAnchors;
+  }
+
+  /**
+   * Turns every {@code region} element into a {@link MascWord}. The ID is taken from
+   * the {@code xml:id} attribute with its {@code seg-r} prefix removed, the start and
+   * end from the first two space-separated values of the {@code anchors} attribute.
+   *
+   * @throws SAXException if the attributes are missing or are not numbers; the
+   *                      original exception is attached as the cause
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      // create a word and put it into the list of words
+      if (qName.equalsIgnoreCase("region")) {
+        int id = Integer.parseInt(attributes.getValue("xml:id").replaceFirst("seg-r", ""));
+        String[] anchors = attributes.getValue("anchors").split(" ");
+
+        int left = Integer.parseInt(anchors[0]);
+        int right = Integer.parseInt(anchors[1]);
+
+        // initialize list
+        if (wordAnchors == null) {
+          wordAnchors = new ArrayList<>();
+        }
+
+        wordAnchors.add(new MascWord(left, right, id));
+      }
+
+    } catch (Exception e) {
+      // Keep the cause so that the offending attribute value can be diagnosed
+      throw new SAXException("Could not parse the word segmentation annotation file.", e);
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java
new file mode 100644
index 0000000..2f11150
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderEvaluator;
+import opennlp.tools.namefind.TokenNameFinderFactory;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests for {@link MascNamedEntitySampleStream}, run against the MASC test resources.
+ */
+public class MascNamedEntitySampleStreamTest {
+
+  private static final String MASC_RESOURCES = "/opennlp/tools/formats/masc/";
+
+  /**
+   * Opens a name sample stream over the MASC test resources.
+   *
+   * @param nameFragment only files whose name contains this string are read
+   */
+  private MascNamedEntitySampleStream openStream(String nameFragment) throws IOException {
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameFragment);
+    File directory = new File(this.getClass().getResource(MASC_RESOURCES).getFile());
+    return new MascNamedEntitySampleStream(
+        new MascDocumentStream(directory, true, fileFilter));
+  }
+
+  /**
+   * Checks the tokens and the single "org" name of the first test sentence.
+   */
+  private static void assertFirstSample(NameSample s) {
+    String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+    assertArrayEquals(expectedTokens, s.getSentence());
+
+    Span[] expectedTags = new Span[] {new Span(4, 5, "org")};
+    Span[] returnedTags = s.getNames();
+    // check the start/end positions
+    assertEquals(expectedTags.length, returnedTags.length);
+    for (int i = 0; i < returnedTags.length; i++) {
+      assertTrue(expectedTags[i].equals(returnedTags[i]));
+    }
+  }
+
+  @Test
+  public void read() {
+    try {
+      MascNamedEntitySampleStream stream = openStream("MASC");
+
+      NameSample s = stream.read();
+      assertFirstSample(s);
+
+      s = stream.read();
+      String[] expectedTokens = {"This", "is", "'nother", "test", "sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      Span[] expectedTags = new Span[] {};
+      Span[] returnedTags = s.getNames();
+      assertArrayEquals(expectedTags, returnedTags);
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void close() {
+    try {
+      MascNamedEntitySampleStream stream = openStream("MASC");
+
+      stream.close();
+      stream.read();
+    } catch (IOException e) {
+      // Only the message is checked; reading without an exception is tolerated
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  @Test
+  public void reset() {
+    try {
+      MascNamedEntitySampleStream stream = openStream("MASC");
+
+      NameSample s = stream.read();
+      s = stream.read();
+      s = stream.read();
+      assertNull(s); //The stream should be exhausted by now
+
+      stream.reset();
+
+      s = stream.read();
+      assertFirstSample(s);
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void train() {
+    try {
+      ObjectStream<NameSample> trainSample = openStream("");
+
+      System.out.println("Training");
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 100);
+
+      TokenNameFinderModel model = NameFinderME.train("en", null, trainSample,
+          trainingParameters, new TokenNameFinderFactory());
+
+      ObjectStream<NameSample> testNames = openStream("");
+      TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
+      evaluator.evaluate(testNames);
+
+      System.out.println(evaluator.getFMeasure());
+
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      StackTraceElement[] traces = e.getStackTrace();
+      for (StackTraceElement trace : traces) {
+        System.err.println(trace.toString());
+      }
+      fail("Exception raised");
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java
new file mode 100644
index 0000000..4eba1c7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import opennlp.tools.postag.POSEvaluator;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests for {@link MascPOSSampleStream}, run against the MASC test resources.
+ */
+public class MascPOSSampleStreamTest {
+
+  private static final String MASC_RESOURCES = "/opennlp/tools/formats/masc/";
+
+  /**
+   * Opens a POS sample stream over the MASC test resources.
+   *
+   * @param nameFragment only files whose name contains this string are read
+   */
+  private MascPOSSampleStream openStream(String nameFragment) throws IOException {
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameFragment);
+    File directory = new File(this.getClass().getResource(MASC_RESOURCES).getFile());
+    return new MascPOSSampleStream(
+        new MascDocumentStream(directory, true, fileFilter));
+  }
+
+  /**
+   * Checks the tokens and tags of the first test sentence.
+   */
+  private static void assertFirstSample(POSSample s) {
+    String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+    assertArrayEquals(expectedTokens, s.getSentence());
+
+    String[] expectedTags = {"DT", "VB", "AT", "NN", "NN", "."};
+    assertArrayEquals(expectedTags, s.getTags());
+  }
+
+  @Test
+  public void read() {
+    try {
+      MascPOSSampleStream stream = openStream("MASC");
+
+      POSSample s = stream.read();
+      assertFirstSample(s);
+
+      s = stream.read();
+      String[] expectedTokens = {"This", "is", "'nother", "test", "sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      String[] expectedTags = {"DT", "VB", "RB", "NN", "NN", "."};
+      assertArrayEquals(expectedTags, s.getTags());
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void close() {
+    try {
+      MascPOSSampleStream stream = openStream("MASC");
+
+      stream.close();
+      stream.read();
+    } catch (IOException e) {
+      // Only the message is checked; reading without an exception is tolerated
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  @Test
+  public void reset() {
+    try {
+      MascPOSSampleStream stream = openStream("MASC");
+
+      POSSample s = stream.read();
+      s = stream.read();
+      s = stream.read();
+      assertNull(s); //The stream should be exhausted by now
+
+      stream.reset();
+
+      s = stream.read();
+      assertFirstSample(s);
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void train() {
+    try {
+      ObjectStream<POSSample> trainPOS = openStream("");
+
+      System.out.println("Training");
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+      POSModel model = POSTaggerME.train("en", trainPOS,
+          trainingParameters, new POSTaggerFactory());
+
+      ObjectStream<POSSample> testPOS = openStream("");
+      POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
+      evaluator.evaluate(testPOS);
+      System.out.println("Accuracy: " + evaluator.getWordAccuracy());
+      System.out.println("Words: " + evaluator.getWordCount());
+
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised");
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java
new file mode 100644
index 0000000..6298273
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import opennlp.tools.sentdetect.SentenceDetectorEvaluator;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests for {@link MascSentenceSampleStream}, run against the MASC test resources.
+ */
+public class MascSentenceSampleStreamTest {
+
+  private static final String MASC_RESOURCES = "/opennlp/tools/formats/masc/";
+
+  /**
+   * Opens a sentence sample stream over the MASC test resources.
+   *
+   * @param nameFragment       only files whose name contains this string are read
+   * @param sentencesPerSample number of sentences per sample
+   */
+  private MascSentenceSampleStream openStream(String nameFragment, int sentencesPerSample)
+      throws IOException {
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameFragment);
+    File directory = new File(this.getClass().getResource(MASC_RESOURCES).getFile());
+    return new MascSentenceSampleStream(
+        new MascDocumentStream(directory, true, fileFilter), sentencesPerSample);
+  }
+
+  /**
+   * Builds the sample that two sentences per sample should yield for the test data.
+   */
+  private static SentenceSample expectedSample() {
+    String documentText = "This is a test Sentence. This is 'nother test sentence. ";
+    List<Span> sentenceSpans = new ArrayList<>();
+    sentenceSpans.add(new Span(0, 24));
+    sentenceSpans.add(new Span(25, 55));
+    return new SentenceSample(documentText,
+        sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+  }
+
+  @Test
+  public void reset() {
+    try {
+      MascSentenceSampleStream stream = openStream("MASC", 2);
+
+      //exhaust the fake file
+      SentenceSample testSample = stream.read();
+
+      //now we should get null
+      testSample = stream.read();
+      assertNull(testSample);
+
+      //by resetting, we should get good results again
+      stream.reset();
+      testSample = stream.read();
+      assertNotNull(testSample);
+
+      assertEquals(expectedSample().toString(), testSample.toString());
+
+    } catch (IOException e) {
+      fail("IO Exception");
+    }
+  }
+
+  @Test
+  public void close() {
+
+    try {
+      MascSentenceSampleStream stream = openStream("MASC", 2);
+      stream.close();
+      stream.read();
+    } catch (IOException e) {
+      // Only the message is checked; reading without an exception is tolerated
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  @Test
+  public void read() {
+    try {
+      MascSentenceSampleStream stream = openStream("", 2);
+
+      SentenceSample testSample = stream.read();
+      assertEquals(expectedSample().toString(), testSample.toString());
+
+      //the fake file is exhausted, we should get null now
+      testSample = stream.read();
+      assertNull(testSample);
+
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+      System.out.println(Arrays.toString(e.getStackTrace()));
+      fail("IO Exception");
+    }
+
+  }
+
+  @Ignore //todo: We can't train on the FakeMasc data, it is too small.
+  @Test
+  public void train() {
+    try {
+      ObjectStream<SentenceSample> trainSentences = openStream("", 1);
+
+      System.out.println("Training");
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+      SentenceModel model = SentenceDetectorME.train("en", trainSentences,
+          new SentenceDetectorFactory(), trainingParameters);
+
+      ObjectStream<SentenceSample> testPOS = openStream("", 1);
+      SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(
+          new SentenceDetectorME(model));
+      evaluator.evaluate(testPOS);
+      System.out.println(evaluator.getFMeasure());
+
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised");
+    }
+
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java
new file mode 100644
index 0000000..ec2fbe1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.tokenize.TokenizerEvaluator;
+import opennlp.tools.tokenize.TokenizerFactory;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+public class MascTokenSampleStreamTest {
+
+  /** Classpath directory holding the fake MASC corpus used by these tests. */
+  private static final String MASC_RESOURCES = "/opennlp/tools/formats/masc/";
+
+  /**
+   * Opens a token sample stream over the test corpus.
+   *
+   * @param nameFragment the file filter accepts only files whose name contains this fragment.
+   * @return a freshly opened stream; the caller is responsible for closing it.
+   * @throws IOException if opening the corpus fails.
+   */
+  private MascTokenSampleStream openStream(String nameFragment) throws IOException {
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameFragment);
+    File directory = new File(this.getClass().getResource(MASC_RESOURCES).getFile());
+    return new MascTokenSampleStream(new MascDocumentStream(directory, true, fileFilter));
+  }
+
+  /** Checks the text and the token spans of the first sentence of the fake corpus. */
+  private static void assertFirstSentence(TokenSample sample) {
+    assertEquals("This is a test Sentence.", sample.getText());
+    Span[] expectedSpans = {
+        new Span(0, 4),
+        new Span(5, 7),
+        new Span(8, 9),
+        new Span(10, 14),
+        new Span(15, 23),
+        new Span(23, 24)};
+    assertArrayEquals(expectedSpans, sample.getTokenSpans());
+  }
+
+  /** Reads both sentences of the fake corpus and checks their text and token spans. */
+  @Test
+  public void read() throws IOException {
+    try (MascTokenSampleStream stream = openStream("MASC")) {
+      assertFirstSentence(stream.read());
+
+      TokenSample second = stream.read();
+      assertEquals("This is 'nother test sentence.", second.getText());
+      Span[] expectedSpans = {
+          new Span(0, 4),
+          new Span(5, 7),
+          new Span(8, 15),
+          new Span(16, 20),
+          new Span(21, 29),
+          new Span(29, 30)};
+      assertArrayEquals(expectedSpans, second.getTokenSpans());
+    }
+  }
+
+  /** A read after {@code close()} must fail with the expected IOException message. */
+  @Test
+  public void close() throws IOException {
+    MascTokenSampleStream stream = openStream("MASC");
+    stream.close();
+    try {
+      stream.read();
+      // Without this the test would pass silently if no exception were thrown.
+      fail("Reading from a closed stream should throw an IOException");
+    } catch (IOException e) {
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  /** After the stream is exhausted, {@code reset()} must restart it at the first sentence. */
+  @Test
+  public void reset() throws IOException {
+    try (MascTokenSampleStream stream = openStream("MASC")) {
+      stream.read();
+      stream.read();
+      assertNull(stream.read()); //The stream should be exhausted by now
+
+      stream.reset();
+
+      assertFirstSentence(stream.read());
+    }
+  }
+
+  /**
+   * Trains a tokenizer on the fake corpus and evaluates it on the same data.
+   * Makes no assertion on the score; it only fails if training or evaluation throws.
+   */
+  @Test
+  public void train() {
+    TrainingParameters trainingParameters = new TrainingParameters();
+    trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+    try {
+      TokenizerModel model;
+      // The empty fragment is contained in every file name, so the filter accepts all files.
+      try (ObjectStream<TokenSample> trainTokens = openStream("")) {
+        System.out.println("Training");
+        model = TokenizerME.train(trainTokens,
+            new TokenizerFactory("en", null, false, null), trainingParameters);
+      }
+
+      try (ObjectStream<TokenSample> testTokens = openStream("")) {
+        TokenizerEvaluator evaluator = new TokenizerEvaluator(new TokenizerME(model));
+        evaluator.evaluate(testTokens);
+        System.out.println(evaluator.getFMeasure());
+      }
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised");
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml
new file mode 100644
index 0000000..3a6caf3
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml
@@ -0,0 +1,20 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+ <graphHeader>
+ <labelsDecl>
+ <labelUsage label="person" occurs="1"/>
+ </labelsDecl>
+ <dependencies>
+ <dependsOn f.id="f.penn"/>
+ </dependencies>
+ <annotationSpaces>
+ <annotationSpace as.id="anc"/>
+ </annotationSpaces>
+ </graphHeader>
+ <node xml:id="ne-n0"/>
+ <a xml:id="ne-N65579" label="org" ref="ne-n0" as="anc">
+ <fs>
+ <f name="type" value="person"/>
+ </fs>
+ </a>
+ <edge xml:id="ne-lnk1" from="ne-n0" to="penn-n4"/>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml
new file mode 100644
index 0000000..2be448a
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml
@@ -0,0 +1,145 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+ <graphHeader>
+ <labelsDecl>
+ <labelUsage label="tok" occurs="12"/>
+ </labelsDecl>
+ <dependencies>
+ <dependsOn f.id="f.seg"/>
+ </dependencies>
+ <annotationSpaces>
+ <annotationSpace as.id="anc"/>
+ </annotationSpaces>
+ </graphHeader>
+
+ <node xml:id="penn-n0">
+ <link targets="seg-r0"/>
+ </node>
+ <a xml:id="penn-N65571" label="tok" ref="penn-n0" as="anc">
+ <fs>
+ <f name="base" value="this"/>
+ <f name="msd" value="DT"/>
+ <f name="string" value="This"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n1">
+ <link targets="seg-r2"/>
+ </node>
+ <a xml:id="penn-N65599" label="tok" ref="penn-n1" as="anc">
+ <fs>
+ <f name="base" value="is"/>
+ <f name="msd" value="VB"/>
+ <f name="string" value="is"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n10">
+ <link targets="seg-r20"/>
+ </node>
+ <a xml:id="penn-N65847" label="tok" ref="penn-n10" as="anc">
+ <fs>
+ <f name="base" value="sentence"/>
+ <f name="msd" value="NN"/>
+ <f name="string" value="sentence"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n11">
+ <link targets="seg-r21"/>
+ </node>
+ <a xml:id="penn-N65875" label="tok" ref="penn-n11" as="anc">
+ <fs>
+ <f name="base" value="."/>
+ <f name="msd" value="."/>
+ <f name="string" value="."/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n2">
+ <link targets="seg-r4"/>
+ </node>
+ <a xml:id="penn-N65627" label="tok" ref="penn-n2" as="anc">
+ <fs>
+ <f name="base" value="a"/>
+ <f name="msd" value="AT"/>
+ <f name="string" value="a"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n3">
+ <link targets="seg-r6"/>
+ </node>
+ <a xml:id="penn-N65655" label="tok" ref="penn-n3" as="anc">
+ <fs>
+ <f name="base" value="sample"/>
+ <f name="msd" value="NN"/>
+ <f name="string" value="sample"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n4">
+ <link targets="seg-r8"/>
+ </node>
+ <a xml:id="penn-N65683" label="tok" ref="penn-n4" as="anc">
+ <fs>
+ <f name="base" value="sentence"/>
+ <f name="msd" value="NN"/>
+ <f name="string" value="Sentence"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n5">
+ <link targets="seg-r9"/>
+ </node>
+ <a xml:id="penn-N65711" label="tok" ref="penn-n5" as="anc">
+ <fs>
+ <f name="string" value="."/>
+ <f name="msd" value="."/>
+ <f name="base" value="."/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n6">
+ <link targets="seg-r11"/>
+ </node>
+ <a xml:id="penn-N65739" label="tok" ref="penn-n6" as="anc">
+ <fs>
+ <f name="base" value="this"/>
+ <f name="msd" value="DT"/>
+ <f name="string" value="This"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n7">
+ <link targets="seg-r13"/>
+ </node>
+ <a xml:id="penn-N65767" label="tok" ref="penn-n7" as="anc">
+ <fs>
+ <f name="base" value="is"/>
+ <f name="msd" value="VB"/>
+ <f name="string" value="is"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n8">
+ <link targets="seg-r15 seg-r16"/>
+ </node>
+ <a xml:id="penn-N65795" label="tok" ref="penn-n8" as="anc">
+ <fs>
+ <f name="base" value="'nother"/>
+ <f name="msd" value="RB"/>
+ <f name="string" value="'nother"/>
+ </fs>
+ </a>
+
+ <node xml:id="penn-n9">
+ <link targets="seg-r18"/>
+ </node>
+ <a xml:id="penn-N65819" label="tok" ref="penn-n9" as="anc">
+ <fs>
+ <f name="base" value="sample"/>
+ <f name="msd" value="NN"/>
+ <f name="string" value="sample"/>
+ </fs>
+ </a>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml
new file mode 100644
index 0000000..bb124dc
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml
@@ -0,0 +1,29 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+ <graphHeader>
+ <labelsDecl>
+ <labelUsage label="s" occurs="2"/>
+ </labelsDecl>
+ <annotationSpaces>
+ <annotationSpace as.id="anc"/>
+ </annotationSpaces>
+ </graphHeader>
+ <region xml:id="s-r0" anchors="1 25"/>
+ <region xml:id="s-r1" anchors="27 57"/>
+
+ <node xml:id="s-n0">
+ <link targets="s-r0"/>
+ </node>
+ <a xml:id="s-N65697" label="s" ref="s-n0" as="anc">
+ <fs>
+ <f name="id" value="s0.1"/>
+ </fs>
+ </a>
+ <node xml:id="s-n1">
+ <link targets="s-r1"/>
+ </node>
+ <a xml:id="s-N65717" label="s" ref="s-n1" as="anc">
+ <fs>
+ <f name="id" value="p1s1"/>
+ </fs>
+ </a>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml
new file mode 100644
index 0000000..a5f33fa
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+ <graphHeader>
+ <labelsDecl>
+ </labelsDecl>
+ </graphHeader>
+ <region xml:id="seg-r0" anchors="1 5"/>
+ <region xml:id="seg-r2" anchors="6 8"/>
+ <region xml:id="seg-r4" anchors="9 10"/>
+ <region xml:id="seg-r6" anchors="11 15"/>
+ <region xml:id="seg-r8" anchors="16 24"/>
+ <region xml:id="seg-r9" anchors="24 25"/>
+ <region xml:id="seg-r11" anchors="27 31"/>
+ <region xml:id="seg-r13" anchors="32 34"/>
+ <region xml:id="seg-r15" anchors="35 36"/>
+ <region xml:id="seg-r16" anchors="36 42"/>
+ <region xml:id="seg-r18" anchors="43 47"/>
+ <region xml:id="seg-r20" anchors="48 56"/>
+ <region xml:id="seg-r21" anchors="56 57"/>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr
new file mode 100644
index 0000000..31778d2
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr
@@ -0,0 +1,37 @@
+<documentHeader xmlns="http://www.xces.org/ns/GrAF/1.0/" docId="MASCF-00046" creator="JZ" date.created="2019-08-27"
+ version="1.0.4">
+ <fileDesc>
+ <titleStmt>
+ <title>FakeMASC</title>
+ </titleStmt>
+ <extent count="10" unit="word"/>
+ <sourceDesc>
+ <title>A fake MASC file to test the OpenNLP MASC-Format tools</title>
+ <publisher>None</publisher>
+ <eAddress type="web">http://github.com/</eAddress>
+ <pubPlace>http://github.com/apache/opennlp</pubPlace>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass catRef="SP TR ">
+ <domain>None</domain>
+ <subdomain>None at all</subdomain>
+ <subject>Unit test</subject>
+ <audience>Adult</audience>
+ </textClass>
+ <primaryData f.id="f.text" loc="fakeMASC.txt"/>
+ <annotations>
+ <annotation loc="fakeMASC-s.xml" f.id="f.s">sentence boundaries</annotation>
+ <annotation loc="fakeMASC-seg.xml" f.id="f.seg">word segments: quarks</annotation>
+ <annotation loc="fakeMASC-penn.xml" f.id="f.penn">Penn POS tags</annotation>
+ <annotation loc="fakeMASC-ne.xml" f.id="f.ne">Named entity annotation</annotation>
+ </annotations>
+ </profileDesc>
+ <revisionDesc>
+ <change>
+ <changeDate>2019-08-27</changeDate>
+ <respName>Jiri Zamecnik</respName>
+ <item>Created the fake file to test the standoff markup</item>
+ </change>
+ </revisionDesc>
+</documentHeader>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt
new file mode 100644
index 0000000..0428dc4
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt
@@ -0,0 +1 @@
+ This is a test Sentence. This is 'nother test sentence.
\ No newline at end of file